def gain(data_x, feature_name, onehotencoder, ori_data_dim, gain_parameters):
    '''Impute missing values in data_x with a GAIN network (TF1 graph mode).

    Args:
      - data_x: original data with missing values (NaN marks a missing cell)
      - feature_name: feature name list of the original data
      - onehotencoder: one-hot encoder fitted on this data
      - ori_data_dim: dimensions of the original (pre-encoding) data
      - gain_parameters: GAIN network parameters:
        - data_name: file name of the dataset (used to build the checkpoint path)
        - batch_size: Batch size
        - hint_rate: Hint rate
        - alpha: Hyperparameter weighting the reconstruction (MSE) loss
        - iterations: Iterations
        - onehot: the number of features for the one-hot encoder (starting from the first feature)
        - predict: when True, restore a saved model instead of training (if a checkpoint exists)

    Returns:
      - imputed_data: imputed data
    '''
    # Mask matrix: 1 = observed, 0 = missing.
    data_m = 1 - np.isnan(data_x)

    # System parameters
    data_name = gain_parameters['data_name']
    batch_size = gain_parameters['batch_size']
    hint_rate = gain_parameters['hint_rate']
    alpha = gain_parameters['alpha']
    iterations = gain_parameters['iterations']
    onehot = gain_parameters['onehot']
    predict = gain_parameters['predict']

    # Checkpoint path used for save/restore below.
    model_path = 'model/' + data_name

    # Other parameters
    no, dim = data_x.shape

    # Hidden state dimensions (same width as the data)
    h_dim = int(dim)

    # Normalization; missing entries become 0 after nan_to_num.
    norm_data, norm_parameters = normalization(data_x)
    # NOTE(review): the second positional argument of np.nan_to_num is `copy`,
    # not the fill value. NaNs are still replaced by 0.0 (the default), but
    # passing 0 here means copy=False, i.e. `norm_data` is modified in place —
    # confirm this aliasing is intended.
    norm_data_x = np.nan_to_num(norm_data, 0)

    ## GAIN architecture
    # Input placeholders
    # Data vector
    X = tf.placeholder(tf.float32, shape=[None, dim], name='X')
    # Mask vector
    M = tf.placeholder(tf.float32, shape=[None, dim], name='M')
    # Hint vector
    H = tf.placeholder(tf.float32, shape=[None, dim], name='H')

    # Discriminator variables
    D_W1 = tf.Variable(xavier_init([dim * 2, h_dim]), name='D_W1')  # Data + Hint as inputs
    D_b1 = tf.Variable(tf.zeros(shape=[h_dim]), name='D_b1')
    D_W2 = tf.Variable(xavier_init([h_dim, h_dim]), name='D_W2')
    D_b2 = tf.Variable(tf.zeros(shape=[h_dim]), name='D_b2')
    D_W3 = tf.Variable(xavier_init([h_dim, dim]), name='D_W3')
    D_b3 = tf.Variable(tf.zeros(shape=[dim]), name='D_b3')  # Multi-variate outputs
    theta_D = [D_W1, D_W2, D_W3, D_b1, D_b2, D_b3]

    # Generator variables
    # Data + Mask as inputs (random noise fills the missing components)
    G_W1 = tf.Variable(xavier_init([dim * 2, h_dim]), name='G_W1')
    G_b1 = tf.Variable(tf.zeros(shape=[h_dim]), name='G_b1')
    G_W2 = tf.Variable(xavier_init([h_dim, h_dim]), name='G_W2')
    G_b2 = tf.Variable(tf.zeros(shape=[h_dim]), name='G_b2')
    G_W3 = tf.Variable(xavier_init([h_dim, dim]), name='G_W3')
    G_b3 = tf.Variable(tf.zeros(shape=[dim]), name='G_b3')
    theta_G = [G_W1, G_W2, G_W3, G_b1, G_b2, G_b3]

    ## GAIN functions
    # Generator: maps (data, mask) -> reconstruction in [0, 1] (sigmoid output).
    def generator(x, m):
        # Concatenate Mask and Data
        inputs = tf.concat(values=[x, m], axis=1)
        G_h1 = tf.nn.relu(tf.matmul(inputs, G_W1) + G_b1)
        G_h2 = tf.nn.relu(tf.matmul(G_h1, G_W2) + G_b2)
        # MinMax normalized output
        G_prob = tf.nn.sigmoid(tf.matmul(G_h2, G_W3) + G_b3)
        return G_prob

    # Discriminator: maps (imputed data, hint) -> per-feature probability of
    # being an observed (real) component.
    def discriminator(x, h):
        # Concatenate Data and Hint
        inputs = tf.concat(values=[x, h], axis=1)
        D_h1 = tf.nn.relu(tf.matmul(inputs, D_W1) + D_b1)
        D_h2 = tf.nn.relu(tf.matmul(D_h1, D_W2) + D_b2)
        D_logit = tf.matmul(D_h2, D_W3) + D_b3
        D_prob = tf.nn.sigmoid(D_logit)
        return D_prob

    ## GAIN structure
    # Generator
    G_sample = generator(X, M)
    # Combine with observed data: keep observed cells, use generated ones elsewhere.
    Hat_X = X * M + G_sample * (1 - M)
    # Discriminator
    D_prob = discriminator(Hat_X, H)

    ## GAIN loss
    # Cross-entropy over observed (M=1, should score 1) vs imputed (M=0, should score 0).
    D_loss_temp = -tf.reduce_mean(M * tf.log(D_prob + 1e-8) \
                                  + (1-M) * tf.log(1. - D_prob + 1e-8))
    # Generator tries to make imputed components look observed.
    G_loss_temp = -tf.reduce_mean((1 - M) * tf.log(D_prob + 1e-8))
    # Reconstruction error on observed components only.
    MSE_loss = \
        tf.reduce_mean((M * X - M * G_sample)**2) / tf.reduce_mean(M)
    D_loss = D_loss_temp
    G_loss = G_loss_temp + alpha * MSE_loss

    ## GAIN solver
    D_solver = tf.train.AdamOptimizer().minimize(D_loss, var_list=theta_D)
    G_solver = tf.train.AdamOptimizer().minimize(G_loss, var_list=theta_G)

    ## Iterations
    sess = tf.Session()
    saver = tf.train.Saver()
    # NOTE(review): reconstructed control flow — training runs only when no
    # checkpoint was restored; confirm against the upstream source.
    if predict is True and os.path.exists(model_path + '.ckpt.meta'):
        print("Model Restore")
        saver.restore(sess, model_path + '.ckpt')
    else:
        sess.run(tf.global_variables_initializer())
        # Start Iterations
        for it in tqdm(range(iterations)):
            # Sample batch
            batch_idx = sample_batch_index(no, batch_size)
            X_mb = norm_data_x[batch_idx, :]
            M_mb = data_m[batch_idx, :]
            # Sample random vectors (noise for the missing components)
            Z_mb = uniform_sampler(0, 0.01, batch_size, dim)
            # Sample hint vectors: reveal a subset of the mask to D.
            H_mb_temp = binary_sampler(hint_rate, batch_size, dim)
            H_mb = M_mb * H_mb_temp
            # Combine random vectors with observed vectors
            X_mb = M_mb * X_mb + (1 - M_mb) * Z_mb

            _, D_loss_curr = sess.run([D_solver, D_loss_temp],
                                      feed_dict={
                                          M: M_mb,
                                          X: X_mb,
                                          H: H_mb
                                      })
            _, G_loss_curr, MSE_loss_curr = \
                sess.run([G_solver, G_loss_temp, MSE_loss],
                         feed_dict = {X: X_mb, M: M_mb, H: H_mb})

    # Persist the trained model when in training mode.
    if predict is False:
        save_path = saver.save(sess, model_path + '.ckpt')

    ## Return imputed data
    Z_mb = uniform_sampler(0, 0.01, no, dim)
    M_mb = data_m
    X_mb = norm_data_x
    X_mb = M_mb * X_mb + (1 - M_mb) * Z_mb
    imputed_data = sess.run([G_sample], feed_dict={X: X_mb, M: M_mb})[0]
    # Keep observed values; only fill the missing cells with generator output.
    imputed_data = data_m * norm_data_x + (1 - data_m) * imputed_data
    # Renormalization (undo the min-max scaling)
    imputed_data = renormalization(imputed_data, norm_parameters)
    # Rounding (restore integer-valued columns)
    imputed_data = rounding(imputed_data, data_x)
    # Reverse encoding: map one-hot columns back to the original categorical features.
    if onehot > 0:
        imputed_data = reverse_encoding(imputed_data, feature_name, onehotencoder,
                                        onehot, ori_data_dim)
    return imputed_data
def cph(data_x, cph_parameters, data_image):
    # Fix all RNG seeds for reproducibility (done before graph construction).
    seed = 25
    random.seed(seed)
    np.random.seed(seed)
    tf.set_random_seed(seed)
    # NOTE(review): the string below is not a real docstring (statements
    # precede it); kept in place to preserve statement order.
    '''Impute missing values in data_x

    Args:
      - data_x: original data with missing values
      - parameters: CPH network parameters:
        - batch_size: Batch size
        - hint_rate: Hint rate
        - alpha: Hyperparameter
        - iterations: Iterations

    Returns:
      - imputed_data: imputed data
    '''
    # Mask matrix: 1 = observed, 0 = missing.
    data_m = 1 - np.isnan(data_x)

    # System parameters
    batch_size = cph_parameters['batch_size']
    hint_rate = cph_parameters['hint_rate']
    alpha = cph_parameters['alpha']
    iterations = cph_parameters['iterations']

    # Other parameters
    no, dim = data_x.shape

    # Hidden state dimensions
    h_dim = int(dim)
    #print(h_dim)

    # Normalization
    norm_data, norm_parameters = normalization(data_x)
    #norm_data_x = np.nan_to_num(norm_data, 0)
    # NOTE(review): the normalized data is computed but the raw data is used
    # here (see the commented-out line above) — confirm this is intentional;
    # renormalization at the end is also commented out, consistent with it.
    norm_data_x = np.nan_to_num(data_x, 0)

    ## CPH architecture
    # Input placeholders
    # 4-D input: [1, 483, dim, 3]. NOTE(review): 483 rows and 3 channels are
    # hard-coded; presumably tied to this dataset's shape — confirm.
    X_pre = tf.placeholder(tf.float32, shape=[1, 483, dim, 3])  # Data vector
    #X = tf.placeholder(tf.float32, shape = [None, dim])
    # Mask vector
    M = tf.placeholder(tf.float32, shape=[None, dim])
    # Hint vector
    H = tf.placeholder(tf.float32, shape=[None, dim])

    # Discriminator variables
    D_W1 = tf.Variable(xavier_init([dim * 2, h_dim]))  # Data + Hint as inputs
    D_b1 = tf.Variable(tf.zeros(shape=[h_dim]))
    D_W2 = tf.Variable(xavier_init([h_dim, h_dim]))
    D_b2 = tf.Variable(tf.zeros(shape=[h_dim]))
    D_W3 = tf.Variable(xavier_init([h_dim, dim]))
    D_b3 = tf.Variable(tf.zeros(shape=[dim]))  # Multi-variate outputs
    theta_D = [D_W1, D_W2, D_W3, D_b1, D_b2, D_b3]

    # Generator variables
    # Convolutional front-end filters (1x4 kernels).
    conv_filter_w1 = tf.Variable(tf.random_normal([1, 4, 3, 3]))
    conv_filter_b1 = tf.Variable(tf.random_normal([3]))
    conv_filter_w2 = tf.Variable(tf.random_normal([1, 4, 3, 1]))
    conv_filter_b2 = tf.Variable(tf.random_normal([1]))
    # Data + Mask as inputs (random noise is in missing components)
    G_W1 = tf.Variable(xavier_init([dim * 2, h_dim]))
    G_b1 = tf.Variable(tf.zeros(shape=[h_dim]))
    G_W2 = tf.Variable(xavier_init([h_dim, h_dim]))
    G_b2 = tf.Variable(tf.zeros(shape=[h_dim]))
    G_W3 = tf.Variable(xavier_init([h_dim, dim]))
    G_b3 = tf.Variable(tf.zeros(shape=[dim]))
    theta_G = [
        G_W1, G_W2, G_W3, G_b1, G_b2, G_b3, conv_filter_w1, conv_filter_b1,
        conv_filter_w2, conv_filter_b2
    ]

    ## CPH functions
    # CNN + Generator: two conv/max-pool stages over the 4-D input, flattened
    # back to [483, dim] before the fully-connected generator layers.
    def generator(x, m):
        relu_feature_maps1 = tf.nn.relu( \
            tf.nn.conv2d(x, conv_filter_w1, strides=[1, 1, 1, 1],
                         padding='SAME') + conv_filter_b1)
        max_pool1 = tf.nn.max_pool(relu_feature_maps1,
                                   ksize=[1, 1, 4, 1],
                                   strides=[1, 1, 1, 1],
                                   padding='SAME')
        relu_feature_maps2 = tf.nn.relu( \
            tf.nn.conv2d(max_pool1, conv_filter_w2, strides=[1, 1, 1, 1],
                         padding='SAME') + conv_filter_b2)
        max_pool2 = tf.nn.max_pool(relu_feature_maps2,
                                   ksize=[1, 1, 4, 1],
                                   strides=[1, 1, 1, 1],
                                   padding='SAME')
        # Drop batch/channel dims: [1, 483, dim, 1] -> [483, dim].
        x2 = tf.reshape(max_pool2, [483, dim])
        # Concatenate Mask and Data
        inputs = tf.concat(values=[x2, m], axis=1)
        G_h1 = tf.nn.relu(tf.matmul(inputs, G_W1) + G_b1)
        G_h2 = tf.nn.relu(tf.matmul(G_h1, G_W2) + G_b2)
        # MinMax normalized output
        G_prob = tf.nn.sigmoid(tf.matmul(G_h2, G_W3) + G_b3)
        return G_prob

    # Discriminator: (imputed data, hint) -> per-feature realness probability.
    def discriminator(x, h):
        # Concatenate Data and Hint
        inputs = tf.concat(values=[x, h], axis=1)
        D_h1 = tf.nn.relu(tf.matmul(inputs, D_W1) + D_b1)
        D_h2 = tf.nn.relu(tf.matmul(D_h1, D_W2) + D_b2)
        D_logit = tf.matmul(D_h2, D_W3) + D_b3
        D_prob = tf.nn.sigmoid(D_logit)
        return D_prob

    ## CPH structure
    # Generator
    G_sample = generator(X_pre, M)
    # Channel 0 of the 4-D input is the data plane.
    X2 = X_pre[0, :, :, 0]
    # Combine with observed data
    Hat_X = X2 * M + G_sample * (1 - M)
    # Discriminator
    D_prob = discriminator(Hat_X, H)

    ## CPH loss
    D_loss_temp = -tf.reduce_mean(M * tf.log(D_prob + 1e-8) \
                                  + (1-M) * tf.log(1. - D_prob + 1e-8))
    G_loss_temp = -tf.reduce_mean((1 - M) * tf.log(D_prob + 1e-8))
    # Reconstruction error on observed components only.
    MSE_loss = \
        tf.reduce_mean((M * X2 - M * G_sample)**2) / tf.reduce_mean(M)
    D_loss = D_loss_temp
    G_loss = G_loss_temp + alpha * MSE_loss

    ## CPH solver
    D_solver = tf.train.AdamOptimizer().minimize(D_loss, var_list=theta_D)
    G_solver = tf.train.AdamOptimizer().minimize(G_loss, var_list=theta_G)

    ## Iterations
    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    # Start Iterations
    for it in tqdm(range(iterations)):
        # Sample batch
        batch_idx = sample_batch_index(no, batch_size)
        #print(len(batch_idx))
        # NOTE(review): X_pre expects 483 rows, so batch_size is presumably
        # 483 for this dataset — confirm against the caller.
        image_mb = data_image[:, batch_idx, :, :]
        X_mb = norm_data_x[batch_idx, :]
        M_mb = data_m[batch_idx, :]
        # Sample random vectors
        Z_mb = uniform_sampler(0, 0.01, batch_size, dim)
        # Sample hint vectors
        H_mb_temp = binary_sampler(hint_rate, batch_size, dim)
        H_mb = M_mb * H_mb_temp
        # Combine random vectors with observed vectors
        X_mb = M_mb * X_mb + (1 - M_mb) * Z_mb
        # Write the (noised) data batch into channel 0 of the image batch.
        image_mb[0, :, :, 0] = X_mb

        _, D_loss_curr = sess.run([D_solver, D_loss_temp],
                                  feed_dict={
                                      M: M_mb,
                                      X_pre: image_mb,
                                      H: H_mb
                                  })
        _, G_loss_curr, MSE_loss_curr = \
            sess.run([G_solver, G_loss_temp, MSE_loss],
                     feed_dict = {X_pre: image_mb, M: M_mb, H: H_mb})

    ## Return imputed data
    Z_mb = uniform_sampler(0, 0.01, no, dim)
    M_mb = data_m
    X_mb = norm_data_x
    X_mb = M_mb * X_mb + (1 - M_mb) * Z_mb
    # NOTE(review): this aliases (does not copy) data_image; the assignment
    # below mutates the caller's array in place — confirm this is acceptable.
    image_mb = data_image
    image_mb[0, :, :, 0] = X_mb
    imputed_data = sess.run([G_sample], feed_dict={
        X_pre: image_mb,
        M: M_mb
    })[0]
    # Keep observed values; only fill missing cells with generator output.
    imputed_data = data_m * norm_data_x + (1 - data_m) * imputed_data
    # Renormalization (disabled — raw data was used above, not normalized data)
    #imputed_data = renormalization(imputed_data, norm_parameters)
    # Rounding
    imputed_data = rounding(imputed_data, data_x)
    return imputed_data
def PC_GAIN (incomplete_data_x , gain_parameters , data_m):
    '''Impute missing values in incomplete_data_x with PC-GAIN
    (GAIN plus pseudo-label supervision from clustering + an SVM classifier).

    Args:
      - incomplete_data_x: original data with missing values
      - gain_parameters: PC_GAIN network parameters:
        - batch_size: Batch size, 64
        - hint_rate: Hint rate, 0.9
        - alpha: Hyperparameter, 200
        - beta: Hyperparameter, 20 (weight of the pseudo-label loss)
        - lambda_: Hyperparameter, 0.2 (fraction of low-missingness rows used for pre-training)
        - k: Hyperparameter, 4 (number of clusters / pseudo-classes)
        - iterations: Iterations, 10000
      - data_m: mask matrix (1 = observed, 0 = missing)

    Returns:
      - imputed_data: imputed data
    '''
    # System parameters
    batch_size = gain_parameters['batch_size']
    hint_rate = gain_parameters['hint_rate']
    alpha = gain_parameters['alpha']
    beta = gain_parameters['beta']
    lambda_ = gain_parameters['lambda_']
    k = gain_parameters['k']
    iterations = gain_parameters['iterations']
    cluster_species = gain_parameters['cluster_species']

    # Other parameters
    no, dim = incomplete_data_x.shape

    # Hidden state dimensions
    h_dim = int(dim)

    # Normalization
    norm_data , norm_parameters = normalization(incomplete_data_x)
    # NOTE(review): second positional arg of np.nan_to_num is `copy`; NaNs are
    # still filled with the default 0.0, but this operates in place — confirm.
    norm_data_x = np.nan_to_num(norm_data, 0)

    ## PC_GAIN architecture
    X = tf.placeholder(tf.float32, shape = [None, dim])  # Data vector
    M = tf.placeholder(tf.float32, shape = [None, dim])  # Mask vector
    H = tf.placeholder(tf.float32, shape = [None, dim])  # Hint vector
    # NOTE(review): Z is declared but never fed or used below.
    Z = tf.placeholder(tf.float32, shape = [None, dim])
    Y = tf.placeholder(tf.float32, shape = [None, k])    # Pseudo-label probabilities

    # Discriminator variables
    D_W1 = tf.Variable(xavier_init([dim*2, h_dim]))  # Data + Hint as inputs
    D_b1 = tf.Variable(tf.zeros(shape = [h_dim]))
    D_W2 = tf.Variable(xavier_init([h_dim, h_dim]))
    D_b2 = tf.Variable(tf.zeros(shape = [h_dim]))
    D_W3 = tf.Variable(xavier_init([h_dim, dim]))
    D_b3 = tf.Variable(tf.zeros(shape = [dim]))  # Multi-variate outputs
    theta_D = [D_W1, D_W2, D_W3, D_b1, D_b2, D_b3]

    # Generator variables
    # Data + Mask as inputs (random noise is in missing components)
    G_W1 = tf.Variable(xavier_init([dim*2, h_dim]))
    G_b1 = tf.Variable(tf.zeros(shape = [h_dim]))
    G_W2 = tf.Variable(xavier_init([h_dim, h_dim]))
    G_b2 = tf.Variable(tf.zeros(shape = [h_dim]))
    G_W3 = tf.Variable(xavier_init([h_dim, dim]))
    G_b3 = tf.Variable(tf.zeros(shape = [dim]))
    theta_G = [G_W1, G_W2, G_W3, G_b1, G_b2, G_b3]

    # Classifier variables (neural-network classifier alternative; see classer()).
    C_W1 = tf.Variable(xavier_init([dim, h_dim]))
    C_b1 = tf.Variable(tf.zeros(shape = [h_dim]))
    C_W2 = tf.Variable(xavier_init([h_dim, h_dim]))
    C_b2 = tf.Variable(tf.zeros(shape = [h_dim]))
    C_W3 = tf.Variable(xavier_init([h_dim, k]))
    C_b3 = tf.Variable(tf.zeros(shape = [k]))  # Classifier
    theta_C = [C_W1, C_b1, C_W2, C_b2, C_W3, C_b3]

    ## PC_GAIN functions
    # Generator
    def generator(x,m):
        # Concatenate Mask and Data
        inputs = tf.concat(values = [x, m], axis = 1)
        G_h1 = tf.nn.relu(tf.matmul(inputs, G_W1) + G_b1)
        G_h2 = tf.nn.relu(tf.matmul(G_h1, G_W2) + G_b2)
        # MinMax normalized output
        G_prob = tf.nn.sigmoid(tf.matmul(G_h2, G_W3) + G_b3)
        return G_prob

    # Discriminator (also returns the pre-sigmoid logits)
    def discriminator(x, h):
        # Concatenate Data and Hint
        inputs = tf.concat(values = [x, h], axis = 1)
        D_h1 = tf.nn.relu(tf.matmul(inputs, D_W1) + D_b1)
        D_h2 = tf.nn.relu(tf.matmul(D_h1, D_W2) + D_b2)
        D_logit = tf.matmul(D_h2, D_W3) + D_b3
        D_prob = tf.nn.sigmoid(D_logit)
        return D_prob, D_logit

    # Classer (neural network classifier mentioned in the paper).
    # NOTE(review): defined (with theta_C) but not used below — the SVM path
    # is used instead.
    def classer(feature):
        C_h1 = tf.nn.relu(tf.matmul(feature, C_W1) + C_b1)
        C_h2 = tf.nn.relu(tf.matmul(C_h1, C_W2) + C_b2)
        C_h3 = tf.matmul(C_h2, C_W3) + C_b3
        C_prob = tf.nn.softmax(C_h3)
        return C_prob

    ## PC_GAIN structure
    # Generator
    G_sample = generator(X, M)
    # Combine with observed data
    Hat_X = X * M + G_sample * (1-M)
    # Discriminator
    D_prob, D_logit = discriminator(Hat_X, H)

    ## PC_GAIN loss
    D_loss_temp = -tf.reduce_mean(M * tf.log(D_prob + 1e-8) \
                                  + (1-M) * tf.log(1. - D_prob + 1e-8))
    G_loss_temp = -tf.reduce_mean((1-M) * tf.log(D_prob + 1e-8))
    # NOTE(review): this is the entropy of the fed-in pseudo-label
    # distribution Y; since Y is a placeholder computed outside the graph,
    # this term has zero gradient w.r.t. theta_G — possibly
    # classer(G_sample) or similar was intended. Confirm against the paper.
    G_loss_with_C = -tf.reduce_mean(Y * tf.log(Y + 1e-8))
    MSE_loss = tf.reduce_mean((M * X - M * G_sample) * (M * X - M * G_sample)) / tf.reduce_mean(M)
    D_loss = D_loss_temp
    G_loss_pre = G_loss_temp + alpha * MSE_loss
    G_loss = G_loss_temp + alpha * MSE_loss + beta * G_loss_with_C

    ## PC_GAIN solver
    D_solver = tf.train.AdamOptimizer().minimize(D_loss, var_list=theta_D)
    G_solver_pre = tf.train.AdamOptimizer().minimize(G_loss_pre, var_list=theta_G)
    G_solver = tf.train.AdamOptimizer().minimize(G_loss, var_list=theta_G)

    ## Iterations
    init = tf.global_variables_initializer()
    with tf.Session() as sess:
        sess.run(init)

        ## Select pre-training data: rows with the fewest missing values.
        loss_rate = []
        for i in range(no):
            index = 0
            for j in range(dim):
                if data_m[i,j] == 0:
                    index = index + 1
            loss_rate.append([index , i])  # [missing count, row index]
        loss_rate = sorted(loss_rate,key=(lambda x:x[0]))
        no_x_L = int(no * lambda_)  # number of low-missingness rows to keep
        index_x_L = []
        for i in range(no_x_L):
            index_x_L.append(loss_rate[i][1])
        norm_data_x_L = norm_data_x[index_x_L, :]
        data_m_L = data_m[index_x_L, :]

        ## Pre-training: plain GAIN on the low-missingness subset (70% of iterations).
        print('...Pre-training')
        for it in tqdm(range(int(iterations * 0.7))):
            batch_idx = sample_batch_index(no_x_L, batch_size)
            X_mb = norm_data_x_L[batch_idx, :]
            M_mb = data_m_L[batch_idx, :]
            Z_mb = uniform_sampler(0, 0.01, batch_size, dim)
            H_mb_temp = binary_sampler(hint_rate, batch_size, dim)
            H_mb = M_mb * H_mb_temp
            X_mb = M_mb * X_mb + (1-M_mb) * Z_mb
            _, D_loss_curr, D_logit_curr, D_prob_curr = sess.run(
                [D_solver, D_loss_temp, D_logit, D_prob],
                feed_dict = {M: M_mb, X: X_mb, H:H_mb})
            _, G_loss_curr, MSE_loss_curr = sess.run(
                [G_solver_pre, G_loss_temp, MSE_loss],
                feed_dict = {X: X_mb, M: M_mb, H:H_mb})

        # Impute the pre-training subset with the pre-trained generator.
        Z_mb = uniform_sampler(0, 0.01, no_x_L, dim)
        M_mb = data_m_L
        X_mb = norm_data_x_L
        X_mb = M_mb * X_mb + (1-M_mb) * Z_mb
        imputed_data_L = sess.run([G_sample], feed_dict = {X: X_mb, M: M_mb})[0]
        imputed_data_L = data_m_L * norm_data_x_L + (1 - data_m_L) * imputed_data_L

        ## Select different clustering methods (pseudo-labels for supervision).
        if cluster_species == 'KM':
            data_c , data_class = KM(imputed_data_L, k)
        elif cluster_species == 'SC':
            data_c , data_class = SC(imputed_data_L, k)
        elif cluster_species == 'AC':
            data_c , data_class = AC(imputed_data_L, k)
        elif cluster_species == 'KMPP':
            data_c , data_class = KMPP(imputed_data_L, k)
        else:
            exit('have not this cluster methods')

        ## Pseudo-label training multi-classification SVM
        ## You can also choose other classifiers,
        ## such as the neural network classifier mentioned in the paper
        coder = preprocessing.OneHotEncoder()
        model = svm.SVC(kernel="linear", decision_function_shape="ovo")
        coder.fit(data_class.reshape(-1,1))
        model.fit(imputed_data_L, data_class)

        ## Update the generator G and the discriminator D on the full data.
        ## To avoid the effects of pre-training,
        ## you can also choose to reinitialize the generator parameters
        for it in tqdm(range(iterations)):
            batch_idx = sample_batch_index(no, batch_size)
            X_mb = norm_data_x[batch_idx, :]
            M_mb = data_m[batch_idx, :]
            Z_mb = uniform_sampler(0, 0.01, batch_size, dim)
            H_mb_temp = binary_sampler(hint_rate, batch_size, dim)
            H_mb = M_mb * H_mb_temp
            X_mb = M_mb * X_mb + (1-M_mb) * Z_mb
            _, D_loss_curr, D_logit_curr, D_prob_curr = sess.run(
                [D_solver, D_loss_temp, D_logit, D_prob],
                feed_dict = {M: M_mb, X: X_mb, H:H_mb})
            ## Introducing pseudo-label supervision: classify the current
            ## imputations with the SVM and feed the one-hot probabilities as Y.
            Hat_X_curr = sess.run(Hat_X, feed_dict = {X: X_mb, M: M_mb, H:H_mb})
            y_pred = model.predict(Hat_X_curr)
            sample_prob = coder.transform(y_pred.reshape(-1,1)).toarray()
            _, G_loss_curr, MSE_loss_curr , G_loss_with_C_curr = sess.run(
                [G_solver, G_loss_temp, MSE_loss, G_loss_with_C],
                feed_dict = {X: X_mb, M: M_mb, H:H_mb , Y:sample_prob})

        ## Return imputed data
        Z_mb = uniform_sampler(0, 0.01, no, dim)
        M_mb = data_m
        X_mb = norm_data_x
        X_mb = M_mb * X_mb + (1-M_mb) * Z_mb
        imputed_data = sess.run([G_sample], feed_dict = {X: X_mb, M: M_mb})[0]
        # Keep observed values; only fill missing cells with generator output.
        imputed_data = data_m * norm_data_x + (1-data_m) * imputed_data
        # Renormalization (undo min-max scaling)
        imputed_data = renormalization(imputed_data, norm_parameters)
        return imputed_data
def gain(data_x, gain_parameters):
    '''Impute missing values in data_x with a GAIN network (TF1 graph mode).

    Args:
      - data_x: original data with missing values (NaN marks a missing cell)
      - gain_parameters: GAIN network parameters:
        - batch_size: Batch size
        - hint_rate: Hint rate
        - alpha: Hyperparameter weighting the reconstruction (MSE) loss
        - iterations: Iterations

    Returns:
      - imputed_data: imputed data
    '''
    # Mask matrix: 1 = observed, 0 = missing.
    data_m = 1 - np.isnan(data_x)

    # System parameters
    batch_size = gain_parameters['batch_size']
    hint_rate = gain_parameters['hint_rate']
    alpha = gain_parameters['alpha']
    iterations = gain_parameters['iterations']

    # Other parameters
    no, dim = data_x.shape

    # Hidden state dimensions
    h_dim = int(dim)

    # Normalization.
    # FIX: the original called np.nan_to_num(norm_data, 0), which passes 0 to
    # the `copy` parameter (second positional), not the NaN fill value — it
    # only worked because the default fill is 0.0, and it silently mutated
    # norm_data in place. Name the fill value explicitly and leave norm_data
    # untouched (matches the sibling implementation that uses nan=0).
    norm_data, norm_parameters = normalization(data_x)
    norm_data_x = np.nan_to_num(norm_data, nan=0.0)

    ## GAIN architecture
    # Input placeholders
    # Data vector
    X = tf.placeholder(tf.float32, shape=[None, dim])
    # Mask vector
    M = tf.placeholder(tf.float32, shape=[None, dim])
    # Hint vector
    H = tf.placeholder(tf.float32, shape=[None, dim])

    # Discriminator variables
    D_W1 = tf.Variable(xavier_init([dim * 2, h_dim]))  # Data + Hint as inputs
    D_b1 = tf.Variable(tf.zeros(shape=[h_dim]))
    D_W2 = tf.Variable(xavier_init([h_dim, h_dim]))
    D_b2 = tf.Variable(tf.zeros(shape=[h_dim]))
    D_W3 = tf.Variable(xavier_init([h_dim, dim]))
    D_b3 = tf.Variable(tf.zeros(shape=[dim]))  # Multi-variate outputs
    theta_D = [D_W1, D_W2, D_W3, D_b1, D_b2, D_b3]

    # Generator variables
    # Data + Mask as inputs (random noise fills the missing components)
    G_W1 = tf.Variable(xavier_init([dim * 2, h_dim]))
    G_b1 = tf.Variable(tf.zeros(shape=[h_dim]))
    G_W2 = tf.Variable(xavier_init([h_dim, h_dim]))
    G_b2 = tf.Variable(tf.zeros(shape=[h_dim]))
    G_W3 = tf.Variable(xavier_init([h_dim, dim]))
    G_b3 = tf.Variable(tf.zeros(shape=[dim]))
    theta_G = [G_W1, G_W2, G_W3, G_b1, G_b2, G_b3]

    ## GAIN functions
    # Generator: maps (data, mask) -> reconstruction in [0, 1] (sigmoid output).
    def generator(x, m):
        # Concatenate Mask and Data
        inputs = tf.concat(values=[x, m], axis=1)
        G_h1 = tf.nn.relu(tf.matmul(inputs, G_W1) + G_b1)
        G_h2 = tf.nn.relu(tf.matmul(G_h1, G_W2) + G_b2)
        # MinMax normalized output
        G_prob = tf.nn.sigmoid(tf.matmul(G_h2, G_W3) + G_b3)
        return G_prob

    # Discriminator: maps (imputed data, hint) -> per-feature probability of
    # being an observed (real) component.
    def discriminator(x, h):
        # Concatenate Data and Hint
        inputs = tf.concat(values=[x, h], axis=1)
        D_h1 = tf.nn.relu(tf.matmul(inputs, D_W1) + D_b1)
        D_h2 = tf.nn.relu(tf.matmul(D_h1, D_W2) + D_b2)
        D_logit = tf.matmul(D_h2, D_W3) + D_b3
        D_prob = tf.nn.sigmoid(D_logit)
        return D_prob

    ## GAIN structure
    # Generator
    G_sample = generator(X, M)
    # Combine with observed data: keep observed cells, generated ones elsewhere.
    Hat_X = X * M + G_sample * (1 - M)
    # Discriminator
    D_prob = discriminator(Hat_X, H)

    ## GAIN loss
    # Cross-entropy: observed cells (M=1) should score 1, imputed cells 0.
    D_loss_temp = -tf.reduce_mean(M * tf.log(D_prob + 1e-8) \
                                  + (1-M) * tf.log(1. - D_prob + 1e-8))
    # Generator tries to make imputed components look observed.
    G_loss_temp = -tf.reduce_mean((1 - M) * tf.log(D_prob + 1e-8))
    # Reconstruction error on observed components only.
    MSE_loss = \
        tf.reduce_mean((M * X - M * G_sample)**2) / tf.reduce_mean(M)
    D_loss = D_loss_temp
    G_loss = G_loss_temp + alpha * MSE_loss

    ## GAIN solver
    D_solver = tf.train.AdamOptimizer().minimize(D_loss, var_list=theta_D)
    G_solver = tf.train.AdamOptimizer().minimize(G_loss, var_list=theta_G)

    ## Iterations
    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    # Start Iterations
    for it in tqdm(range(iterations)):
        # Sample batch
        batch_idx = sample_batch_index(no, batch_size)
        X_mb = norm_data_x[batch_idx, :]
        M_mb = data_m[batch_idx, :]
        # Sample random vectors (noise for missing components)
        Z_mb = uniform_sampler(0, 0.01, batch_size, dim)
        # Sample hint vectors: reveal a subset of the mask to D.
        H_mb_temp = binary_sampler(hint_rate, batch_size, dim)
        H_mb = M_mb * H_mb_temp
        # Combine random vectors with observed vectors
        X_mb = M_mb * X_mb + (1 - M_mb) * Z_mb

        _, D_loss_curr = sess.run([D_solver, D_loss_temp],
                                  feed_dict={
                                      M: M_mb,
                                      X: X_mb,
                                      H: H_mb
                                  })
        _, G_loss_curr, MSE_loss_curr = \
            sess.run([G_solver, G_loss_temp, MSE_loss],
                     feed_dict = {X: X_mb, M: M_mb, H: H_mb})

    ## Return imputed data
    Z_mb = uniform_sampler(0, 0.01, no, dim)
    M_mb = data_m
    X_mb = norm_data_x
    X_mb = M_mb * X_mb + (1 - M_mb) * Z_mb
    imputed_data = sess.run([G_sample], feed_dict={X: X_mb, M: M_mb})[0]
    # Keep observed values; only fill missing cells with generator output.
    imputed_data = data_m * norm_data_x + (1 - data_m) * imputed_data
    # Renormalization (undo the min-max scaling)
    imputed_data = renormalization(imputed_data, norm_parameters)
    # Rounding (restore integer-valued columns)
    imputed_data = rounding(imputed_data, data_x)
    return imputed_data
def egain(miss_data_x, gain_parameters):
    #def Egain(miss_data_x, gain_parameters):
    '''Impute missing values in miss_data_x with an evolutionary GAIN
    (E-GAN-style: a population of generator candidates is mutated with three
    different adversarial losses and selected by a fitness score).

    Args:
      - miss_data_x: missing data (NaN marks a missing cell)
      - gain_parameters: GAIN network parameters:
        - batch_size: Batch size
        - alpha: Hyperparameter weighting the reconstruction (MSE) loss
        - iterations: Iterations

    Returns:
      - imputed_data: imputed data
    '''
    # Mask matrix: 1 = observed, 0 = missing.
    m = 1 - np.isnan(miss_data_x)

    # System parameters
    batch_size = gain_parameters['batch_size']
    # hint_rate = gain_parameters['hint_rate']
    alpha = gain_parameters['alpha']
    iterations = gain_parameters['iterations']
    # Mutation losses tried for each candidate generator per iteration.
    loss_type = ['trickLogD', 'minimax', 'ls']
    nloss = 3
    beta = 1.0    # weight of the diversity term in the fitness score
    ncandi = 1    #1#3  population size
    nbest = 1     #1#3  number of best generators kept
    nD = 1        # # of discrim updates for each gen update

    # Other parameters
    no, dim = miss_data_x.shape

    # Hidden state dimensions
    h_dim = int(dim)

    # Normalization
    norm_data, norm_parameters = normalization(miss_data_x)
    # NOTE(review): second positional arg of np.nan_to_num is `copy`; NaNs are
    # still filled with the default 0.0, but this operates in place — confirm.
    norm_data_x = np.nan_to_num(norm_data, 0)

    ## GAIN architecture
    #tf.reset_default_graph()
    # NOTE(review): this call's return value is discarded — it does not reset
    # the graph; presumably tf.compat.v1.reset_default_graph() was intended
    # (see the commented-out line above). Confirm.
    tf.compat.v1.get_default_graph()

    # Input placeholders
    # Data vector
    X = tf1.placeholder(tf.float32, shape=[None, dim])
    # Mask vector
    M = tf1.placeholder(tf.float32, shape=[None, dim])
    # B vector (random reveal pattern used to build the hint)
    B = tf1.placeholder(tf.float32, shape=[None, dim])

    # Discriminator variables
    D_W1 = tf.Variable(xavier_init([dim * 2, h_dim]))  # Data + Hint as inputs
    D_b1 = tf.Variable(tf.zeros(shape=[h_dim]))
    D_W2 = tf.Variable(xavier_init([h_dim, h_dim]))
    D_b2 = tf.Variable(tf.zeros(shape=[h_dim]))
    D_W3 = tf.Variable(xavier_init([h_dim, dim]))
    D_b3 = tf.Variable(tf.zeros(shape=[dim]))  # Multi-variate outputs
    theta_D = [D_W1, D_W2, D_W3, D_b1, D_b2, D_b3]

    # Generator variables
    # Data + Mask as inputs (random noise is in missing components)
    G_W1 = tf.Variable(xavier_init([dim * 2, h_dim]))
    G_b1 = tf.Variable(tf.zeros(shape=[h_dim]))
    G_W2 = tf.Variable(xavier_init([h_dim, h_dim]))
    G_b2 = tf.Variable(tf.zeros(shape=[h_dim]))
    G_W3 = tf.Variable(xavier_init([h_dim, dim]))
    G_b3 = tf.Variable(tf.zeros(shape=[dim]))
    theta_G = [G_W1, G_W2, G_W3, G_b1, G_b2, G_b3]

    ## GAIN functions
    # Generator
    def generator(x, m):
        # Concatenate Mask and Data
        inputs = tf.concat(values=[x, m], axis=1)
        G_h1 = tf.nn.relu(tf.matmul(inputs, G_W1) + G_b1)
        G_h2 = tf.nn.relu(tf.matmul(G_h1, G_W2) + G_b2)
        # MinMax normalized output
        G_prob = tf.nn.sigmoid(tf.matmul(G_h2, G_W3) + G_b3)
        return G_prob

    # Discriminator
    def discriminator(x, h):
        # Concatenate Data and Hint
        inputs = tf.concat(values=[x, h], axis=1)
        D_h1 = tf.nn.relu(tf.matmul(inputs, D_W1) + D_b1)
        D_h2 = tf.nn.relu(tf.matmul(D_h1, D_W2) + D_b2)
        D_logit = tf.matmul(D_h2, D_W3) + D_b3
        D_prob = tf.nn.sigmoid(D_logit)
        return D_prob

    ## GAIN structure
    # Hint vector built from B: revealed cells carry the mask, hidden ones 0.5.
    H = B * M + 0.5 * (1 - B)  # 0.5 => 0.1
    # Generator
    G_sample = generator(X, M)
    # D applied directly to the current generator's imputation (for G losses).
    D_prob_g = discriminator(X * M + G_sample * (1 - M), H)
    # Separately-fed fake data (lets D train against cached candidate samples).
    fake_X = tf1.placeholder(tf.float32, shape=[None, dim])
    Hat_X = X * M + fake_X * (1 - M)
    # D loss
    D_prob = discriminator(Hat_X, H)
    D_loss_temp = -tf.reduce_mean(
        (M * tf1.log(D_prob + 1e-8) + (1 - M) * tf1.log(1. - D_prob + 1e-8)))
    D_loss = D_loss_temp
    # Updated parameter
    D_solver = tf1.train.AdamOptimizer(learning_rate=0.002,
                                       beta1=0.5,
                                       beta2=0.99).minimize(D_loss,
                                                            var_list=theta_D)

    # G loss
    # Three alternative generator objectives (mutations):
    #   logD trick: -log D, minimax: log(1 - D), least-squares: (D - 1)^2.
    G_loss_logD = -tf.reduce_mean((1 - M) * 1 / 2 * tf1.log(D_prob_g + 1e-8))
    G_loss_minimax = tf.reduce_mean(
        (1 - M) * 1 / 2 * tf1.log(1. - D_prob_g + 1e-8))
    G_loss_ls = tf1.reduce_mean((1 - M) * tf1.square(D_prob_g - 1))
    # Reconstruction error on observed components only.
    MSE_loss = tf.reduce_mean((M * X - M * G_sample)**2) / tf.reduce_mean(M)
    G_loss_logD_all = G_loss_logD + alpha * MSE_loss
    G_loss_minimax_all = G_loss_minimax + alpha * MSE_loss
    G_loss_ls_all = G_loss_ls + alpha * MSE_loss

    # Update parameter: one solver per mutation objective.
    G_solver_logD = tf1.train.AdamOptimizer(learning_rate=0.002,
                                            beta1=0.5,
                                            beta2=0.99).minimize(
                                                G_loss_logD_all,
                                                var_list=theta_G)
    G_solver_minimax = tf1.train.AdamOptimizer(learning_rate=0.002,
                                               beta1=0.5,
                                               beta2=0.99).minimize(
                                                   G_loss_minimax_all,
                                                   var_list=theta_G)
    G_solver_ls = tf1.train.AdamOptimizer(learning_rate=0.002,
                                          beta1=0.5,
                                          beta2=0.99).minimize(
                                              G_loss_ls_all,
                                              var_list=theta_G)

    # Fitness function: quality (D's score on imputed cells) plus a diversity
    # term based on the norm of D's gradients.
    Fq_score = tf.reduce_mean((1 - M) * D_prob)
    Fd_score = -tf1.log(
        tf.reduce_sum(tf.square(tf.gradients(D_loss_temp, theta_D[0]))) +
        tf.reduce_sum(tf.square(tf.gradients(D_loss_temp, theta_D[1]))) +
        tf.reduce_sum(tf.square(tf.gradients(D_loss_temp, theta_D[2]))) +
        tf.reduce_sum(tf.square(tf.gradients(D_loss_temp, theta_D[3]))) +
        tf.reduce_sum(tf.square(tf.gradients(D_loss_temp, theta_D[4]))) +
        tf.reduce_sum(tf.square(tf.gradients(D_loss_temp, theta_D[5]))))

    ## Iterations
    sess = tf1.Session()

    # Start Iterations
    gen_new_params = []                 # weights of the current candidate population
    fitness_best = np.zeros(nbest)      # fitness of the best generators kept
    fitness_candi = np.zeros(ncandi)    # fitness of the current candidates
    # for it in tqdm(range(iterations)):
    for it in tqdm(range(iterations)):
        # Train candidates G
        if it == 0:
            # Initialization: create ncandi independently-initialized candidates.
            for can_i in range(0, ncandi):
                sess.run(tf1.global_variables_initializer())
                batch_idx = sample_batch_index(no, batch_size)
                X_mb = norm_data_x[batch_idx, :]
                M_mb = m[batch_idx, :]
                Z_mb = uniform_sampler(0.0, 0.01, batch_size, dim)
                X_mb = M_mb * X_mb + (1 - M_mb) * Z_mb
                B_mb = sample_batch_binary(dim, batch_size)
                gen_samples = sess.run([G_sample],
                                       feed_dict={
                                           X: X_mb,
                                           M: M_mb
                                       })[0]
                fq_score, fd_score = sess.run([Fq_score, Fd_score],
                                              feed_dict={
                                                  X: X_mb,
                                                  M: M_mb,
                                                  fake_X: gen_samples,
                                                  B: B_mb
                                              })
                fitness = fq_score + beta * fd_score
                fitness_best[can_i] = fitness
                # Snapshot this candidate's generator weights.
                params = []
                for param in theta_G:
                    params.append(sess.run(param))
                gen_new_params.append(params)
            gen_best_params = copy.deepcopy(gen_new_params)
        else:
            # Generate new candidates: mutate each parent with each loss type.
            gen_old_params = copy.deepcopy(gen_new_params)
            # print(gen_old_params[0][-1])
            # print(it)
            for can_i in range(ncandi):
                for type_i in range(nloss):
                    batch_idx = sample_batch_index(no, batch_size)
                    X_mb = norm_data_x[batch_idx, :]
                    M_mb = m[batch_idx, :]
                    Z_mb = uniform_sampler(0.0, 0.01, batch_size,
                                           dim)  # update 1.0 ==> 0.01
                    X_mb = M_mb * X_mb + (1 - M_mb) * Z_mb
                    B_mb = sample_batch_binary(dim, batch_size)
                    # Load the parent's weights before applying the mutation.
                    for i in range(len(theta_G)):
                        theta_G[i].load(gen_old_params[can_i][i], sess)
                    loss = loss_type[type_i]
                    # NOTE(review): 'trickLogD' runs G_solver_minimax and
                    # 'minimax' runs G_solver_logD — the mapping looks
                    # swapped relative to the loss names; confirm against
                    # the E-GAN reference implementation.
                    if loss == 'trickLogD':
                        sess.run([G_solver_minimax],
                                 feed_dict={
                                     X: X_mb,
                                     M: M_mb,
                                     B: B_mb
                                 })
                    elif loss == 'minimax':
                        sess.run([G_solver_logD],
                                 feed_dict={
                                     X: X_mb,
                                     M: M_mb,
                                     B: B_mb
                                 })
                    elif loss == 'ls':
                        sess.run([G_solver_ls],
                                 feed_dict={
                                     X: X_mb,
                                     M: M_mb,
                                     B: B_mb
                                 })
                    # calculate fitness score of the mutated candidate
                    gen_samples = sess.run([G_sample],
                                           feed_dict={
                                               X: X_mb,
                                               M: M_mb
                                           })[0]
                    fq_score, fd_score = sess.run([Fq_score, Fd_score],
                                                  feed_dict={
                                                      X: X_mb,
                                                      M: M_mb,
                                                      fake_X: gen_samples,
                                                      B: B_mb
                                                  })
                    fitness = fq_score + beta * fd_score
                    # print(fitness)
                    # Replace the worst "best" generator if this one beats it.
                    gap = fitness_best - fitness
                    if min(gap) < 0:
                        idx_replace = np.argmin(gap)
                        params = []
                        for param in theta_G:
                            params.append(sess.run(param))
                        gen_best_params[idx_replace] = params
                        fitness_best[idx_replace] = fitness
                    # Fill/replace the candidate population by fitness.
                    if can_i * nloss + type_i < ncandi:
                        idx = can_i * nloss + type_i
                        params = []
                        for param in theta_G:
                            params.append(sess.run(param))
                        gen_new_params[idx] = params
                        fitness_candi[idx] = fitness
                    else:
                        gap = fitness_candi - fitness
                        if min(gap) < 0:
                            idx_replace = np.argmin(gap)
                            params = []
                            for param in theta_G:
                                params.append(sess.run(param))
                            gen_new_params[idx_replace] = params
                            fitness_candi[idx_replace] = fitness

        # Train D: each candidate imputes an equal slice of the batch.
        for i in range(nD):
            batch_idx = sample_batch_index(no, batch_size)
            X_mb = norm_data_x[batch_idx, :]
            M_mb = m[batch_idx, :]
            Z_mb = uniform_sampler(0.0, 0.01, batch_size, dim)  # 1.0 ==> 0.01
            X_mb = M_mb * X_mb + (1 - M_mb) * Z_mb
            B_mb = sample_batch_binary(dim, batch_size)
            # impute data for each candidate
            for can_i in range(ncandi):
                for w in range(len(theta_G)):
                    theta_G[w].load(gen_new_params[can_i][w], sess)
                if can_i == ncandi - 1:
                    # Last candidate takes the remainder of the batch.
                    gen_samples_cani = sess.run(
                        [G_sample],
                        feed_dict={
                            X: X_mb[can_i * batch_size // ncandi:],
                            M: M_mb[can_i * batch_size // ncandi:]
                        })[0]
                else:
                    gen_samples_cani = sess.run(
                        [G_sample],
                        feed_dict={
                            X:
                            X_mb[can_i * batch_size // ncandi:(can_i + 1) *
                                 batch_size // ncandi],
                            M:
                            M_mb[can_i * batch_size // ncandi:(can_i + 1) *
                                 batch_size // ncandi]
                        })[0]
                # print(gen_samples_cani.shape)
                if can_i == 0:
                    gen_samples = gen_samples_cani
                else:
                    gen_samples = np.append(gen_samples,
                                            gen_samples_cani,
                                            axis=0)
            sess.run([D_solver],
                     feed_dict={
                         X: X_mb,
                         M: M_mb,
                         fake_X: gen_samples,
                         B: B_mb
                     })

    ## Return imputed data: use the best generator found.
    idx = np.argmax(fitness_best)
    # print(idx)
    for i in range(len(theta_G)):
        theta_G[i].load(gen_best_params[idx][i], sess)
    Z_mb = uniform_sampler(0.0, 0.01, no, dim)
    M_mb = m
    X_mb = norm_data_x
    X_mb = M_mb * X_mb + (1 - M_mb) * Z_mb
    imputed_data = sess.run([G_sample], feed_dict={X: X_mb, M: M_mb})[0]
    sess.close()
    # Keep observed values; only fill missing cells with generator output.
    imputed_data = m * norm_data_x + (1 - m) * imputed_data
    # Renormalization (undo the min-max scaling)
    imputed_data = renormalization(imputed_data, norm_parameters)
    # Rounding (restore integer-valued columns)
    imputed_data = rounding(imputed_data, miss_data_x)
    return imputed_data
def gain(data_x, gain_parameters, ori_data_x, train_index, test_index, mechanism):
    '''Impute missing values in data_x with a two-discriminator GAIN (PyTorch).

    Args:
      - data_x: original data with missing values
      - gain_parameters: GAIN network parameters:
        - batch_size: Batch size
        - hint_rate: Hint rate
        - alpha: Hyper-parameter
        - iterations: Iterations
      - ori_data_x: complete reference data, forwarded to the final test() call
      - train_index: row indices used for training batches
      - test_index: row indices used for evaluation
      - mechanism: missingness mechanism; 'mar' selects the MAR hint sampler

    Note: evaluation happens via test() at the end; nothing is returned.
    '''
    # Mask matrix: 1 where a value is observed, 0 where it is missing.
    data_m = 1 - np.isnan(data_x)

    # Unpack system parameters.
    batch_size = gain_parameters['batch_size']
    hint_rate = gain_parameters['hint_rate']
    alpha = gain_parameters['alpha']
    iterations = gain_parameters['iterations']

    # Dimensions: `no` rows overall, `no_train` rows in the training split.
    no, dim = data_x.shape
    no_train = len(train_index)

    # Hidden width matches the input width.
    h_dim = int(dim)

    # Normalize to [0, 1] and zero-fill the missing entries.
    norm_data, norm_parameters = normalization(data_x)
    norm_data_x = np.nan_to_num(norm_data, nan=0)

    # One generator, two independently-hinted discriminators.
    net_g = Generator(dim, h_dim)
    net_d1 = Discriminator(dim, h_dim)
    net_d2 = Discriminator(dim, h_dim)

    # Optimizers: Adam for G, plain SGD for both discriminators.
    opt_g = torch.optim.Adam(net_g.parameters(), lr=0.0001, betas=(0.5, 0.999))
    opt_d1 = torch.optim.SGD(net_d1.parameters(), lr=0.0001)
    opt_d2 = torch.optim.SGD(net_d2.parameters(), lr=0.0001)

    for _ in tqdm(range(iterations), desc='pytorch'):
        # Draw a mini-batch restricted to the training rows.
        batch_idx = sample_batch_index(no_train, batch_size)
        x_mb = norm_data_x[train_index][batch_idx, :]
        m_mb = data_m[train_index][batch_idx, :]

        # Random noise for the missing slots.
        z_mb = uniform_sampler(0, 0.01, batch_size, dim)

        # One hint vector per discriminator, drawn independently.
        if mechanism == 'mar':
            h_mb = hint_for_mar(hint_rate, m_mb)
            h_mb_2 = hint_for_mar(hint_rate, m_mb)
        else:
            h_mb = m_mb * binary_sampler(hint_rate, batch_size, dim)
            h_mb_2 = m_mb * binary_sampler(hint_rate, batch_size, dim)

        # Observed values stay; missing slots get noise.
        x_mb = m_mb * x_mb + (1 - m_mb) * z_mb

        x_mb = torch.Tensor(x_mb)
        m_mb = torch.Tensor(m_mb)
        h_mb = torch.Tensor(h_mb)
        h_mb_2 = torch.Tensor(h_mb_2)

        # Forward pass: impute, then score the combined sample twice.
        g_sample = net_g(x_mb, m_mb)
        hat_x = x_mb * m_mb + g_sample * (1 - m_mb)
        d_prob_1 = net_d1(hat_x, h_mb)
        d_prob_2 = net_d2(hat_x, h_mb_2)

        d_loss_1 = d_loss(m_mb, d_prob_1)
        d_loss_2 = d_loss(m_mb, d_prob_2)
        # The generator is trained against the average of both discriminators.
        g_loss_value = g_loss(m_mb, 0.5 * d_prob_1 + 0.5 * d_prob_2,
                              alpha, x_mb, g_sample)

        # Zero grads, backprop all three losses on the shared graph, then step.
        # The ordering below is deliberate and preserved from the original.
        opt_d2.zero_grad()
        opt_d1.zero_grad()
        opt_g.zero_grad()
        g_loss_value.backward(retain_graph=True)
        d_loss_1.backward(retain_graph=True)
        d_loss_2.backward(retain_graph=True)
        opt_g.step()
        opt_d1.step()
        opt_d2.step()

    # Final evaluation on the held-out rows.
    test(data_m, data_x, dim, net_g, no, norm_data_x, norm_parameters,
         ori_data_x, test_index)
def gain(data_x, gain_parameters):
    '''Impute missing values in data_x with GAIN (TensorFlow 2 / Keras).

    Args:
      - data_x: original data with missing values
      - gain_parameters: GAIN network parameters:
        - batch_size: Batch size
        - hint_rate: Hint rate
        - alpha: Hyperparameter
        - iterations: Iterations

    Returns:
      - imputed_data: imputed data
    '''
    # Define mask matrix (1 = observed, 0 = missing)
    data_m = (1 - np.isnan(data_x)).astype(float)

    # System parameters
    batch_size = gain_parameters['batch_size']
    hint_rate = gain_parameters['hint_rate']
    alpha = gain_parameters['alpha']
    iterations = gain_parameters['iterations']

    # Other parameters
    no, dim = data_x.shape

    # Hidden state dimensions
    h_dim = int(dim)

    # Normalization
    norm_data, norm_parameters = normalization(data_x)
    # BUG FIX: np.nan_to_num's second positional argument is `copy`, not the
    # fill value, so `np.nan_to_num(norm_data, 0)` silently meant copy=False.
    # Pass the fill value by keyword so the intent (NaN -> 0) is explicit.
    norm_data_x = np.nan_to_num(norm_data, nan=0.0)

    # Parameter initialization: full data and mask as float32 tensors.
    X = tf.convert_to_tensor(norm_data_x)
    X = tf.dtypes.cast(X, tf.float32)
    M = tf.convert_to_tensor(data_m)
    M = tf.dtypes.cast(M, tf.float32)
    X_input = tf.concat(values=[X, M], axis=1)

    ## GAIN architecture
    # Generator: (data, mask) concatenated -> imputed data in [0, 1].
    class Generator(tf.keras.Model):

        def __init__(self):
            super().__init__()
            self.flatten = layers.Flatten(input_shape=[dim * 2])
            self.dense1 = layers.Dense(h_dim, activation='relu')
            self.dense2 = layers.Dense(h_dim, activation='relu')
            self.dense_output = layers.Dense(dim, activation='sigmoid')

        def call(self, inputs, training=None):
            x = self.flatten(inputs)
            x = self.dense1(x)
            x = self.dense2(x)
            return self.dense_output(x)

    # Discriminator: (data, hint) concatenated -> per-feature probability
    # that each entry is observed rather than imputed.
    class Discriminator(tf.keras.Model):

        def __init__(self):
            super().__init__()
            self.flatten = layers.Flatten(input_shape=[dim * 2])
            self.dense1 = layers.Dense(h_dim, activation='relu')
            self.dense2 = layers.Dense(h_dim, activation='relu')
            self.dense_output = layers.Dense(dim, activation='sigmoid')

        def call(self, inputs, training=None):
            x = self.flatten(inputs)
            x = self.dense1(x)
            x = self.dense2(x)
            return self.dense_output(x)

    ## GAIN loss
    # Generator loss: fool the discriminator on missing entries plus a
    # reconstruction (MSE) term on observed entries weighted by alpha.
    def generator_loss(generator, discriminator, x, m):
        generator.trainable = True
        discriminator.trainable = False
        G_input = tf.concat(values=[x, m], axis=1)
        G_sample = generator(G_input)
        MSE_loss = tf.reduce_mean(
            (m * x - m * G_sample)**2) / tf.reduce_mean(m)
        D_input = tf.concat(values=[G_sample, m], axis=1)
        D_prob = discriminator(D_input)
        G_loss_tmp = -tf.reduce_mean((1 - m) * tf.math.log(D_prob + 1e-8))
        return G_loss_tmp + alpha * MSE_loss

    # Discriminator loss: standard cross-entropy against the mask, using the
    # hint vector h as the discriminator's side information.
    def discriminator_loss(generator, discriminator, x, m, h):
        generator.trainable = False
        discriminator.trainable = True
        G_input = tf.concat(values=[x, m], axis=1)
        G_sample = generator(G_input)
        x_hat = x * m + G_sample * (1 - m)
        D_input = tf.concat(values=[x_hat, h], axis=1)
        D_prob = discriminator(D_input)
        return -tf.reduce_mean(m * tf.math.log(D_prob + 1e-8) \
                               + (1 - m) * tf.math.log(1. - D_prob + 1e-8))

    # Build both networks and their optimizers.
    generator = Generator()
    generator.build(input_shape=(None, 2 * dim))
    g_optimizer = tf.keras.optimizers.Adam()
    discriminator = Discriminator()
    discriminator.build(input_shape=(None, 2 * dim))
    d_optimizer = tf.keras.optimizers.Adam()

    # Training
    one_tensor = tf.constant(1., shape=(batch_size, dim), dtype=float)
    for _ in tqdm(range(iterations)):
        # Sample batch
        batch_idx = sample_batch_index(no, batch_size)
        X_mb = tf.gather(X, batch_idx)
        M_mb = tf.gather(M, batch_idx)
        Z_mb = tf.convert_to_tensor(uniform_sampler(0, 0.01, batch_size, dim),
                                    dtype=float)
        H_mb_tmp = tf.convert_to_tensor(binary_sampler(hint_rate, batch_size, dim),
                                        dtype=float)
        H_mb = tf.math.multiply(M_mb, H_mb_tmp)
        # Combine random vectors with observed vectors:
        # X_mb = M_mb * X_mb + (1 - M_mb) * Z_mb
        X_mb = tf.math.add(tf.math.multiply(M_mb, X_mb), \
                           tf.math.multiply(tf.math.subtract(one_tensor, M_mb), Z_mb))

        # Train the discriminator first, then the generator.
        with tf.GradientTape() as tape:
            d_loss = discriminator_loss(generator, discriminator, X_mb, M_mb, H_mb)
        grads = tape.gradient(d_loss, discriminator.trainable_variables)
        d_optimizer.apply_gradients(
            zip(grads, discriminator.trainable_variables))

        with tf.GradientTape() as tape:
            g_loss = generator_loss(generator, discriminator, X_mb, M_mb)
        grads = tape.gradient(g_loss, generator.trainable_variables)
        g_optimizer.apply_gradients(zip(grads, generator.trainable_variables))

    ## Return imputed data: run the trained generator over the full dataset
    # in batches (idiomatic `for batch in dataset` instead of manual iter/next).
    imputed_data = np.array([]).reshape(0, dim)
    train_data = tf.data.Dataset.from_tensor_slices(X_input).batch(batch_size)
    for batch in train_data:
        X_tmp = generator(batch).numpy()
        imputed_data = np.vstack([imputed_data, X_tmp])

    # Renormalization
    imputed_data = renormalization(imputed_data, norm_parameters)

    # Recovery: keep observed values, use generator output only where missing.
    imputed_data = data_m * np.nan_to_num(data_x) + (1 - data_m) * imputed_data

    # Rounding
    imputed_data = rounding(imputed_data, data_x)

    return imputed_data
# Generate MCAR benchmark samples: draws random subsamples of the house
# dataset and writes both a complete and a missing-value version of each.
import os

import numpy as np
import pandas as pd
from tqdm import trange

from utils import sample_batch_index, binary_sampler

if __name__ == '__main__':
    # Load data
    file_name = 'data/house.csv'
    house_df = pd.read_csv(file_name)
    no, dim = house_df.shape
    data_x = house_df.values.astype(np.float32)

    num_samples = 200
    miss_rate = 0.3

    # Ensure the output directories exist; np.savetxt does not create them.
    os.makedirs("./samples/complete", exist_ok=True)
    os.makedirs("./samples/MCAR", exist_ok=True)

    for i in trange(num_samples):
        # Random subsample of 10,000 rows; save the complete version.
        sample_idx = sample_batch_index(no, 10000)
        data_x_i = data_x[sample_idx, :]
        no_i, dim_i = data_x_i.shape
        np.savetxt("./samples/complete/sample_{}.csv".format(i),
                   data_x_i,
                   delimiter=",")

        # Introduce missing data completely at random at rate `miss_rate`.
        data_m = binary_sampler(1 - miss_rate, no_i, dim_i)
        miss_data_x = data_x_i.copy()
        miss_data_x[data_m == 0] = np.nan
        np.savetxt("./samples/MCAR/sample_{}.csv".format(i),
                   miss_data_x,
                   delimiter=",")
def gain(miss_data_x, gain_parameters):
    '''Impute missing values in data_x (TF1 graph-mode GAIN variant).

    Uses a B-vector hint H = B*M + 0.5*(1-B) instead of a hint_rate mask, and
    keeps before/after snapshots of the generator weights so the discriminator
    and generator updates each start from a chosen parameter set.

    Args:
      - miss_data_x: missing data
      - gain_parameters: GAIN network parameters:
        - batch_size: Batch size
        - alpha: Hyperparameter
        - iterations: Iterations

    Returns:
      - imputed_data: imputed data
    '''
    # Define mask matrix (1 = observed, 0 = missing)
    m = 1 - np.isnan(miss_data_x)

    # System parameters
    batch_size = gain_parameters['batch_size']
    alpha = gain_parameters['alpha']
    iterations = gain_parameters['iterations']

    # Other parameters
    no, dim = miss_data_x.shape

    # Hidden state dimensions
    h_dim = int(dim)

    # Normalization
    norm_data, norm_parameters = normalization(miss_data_x)
    # BUG FIX: np.nan_to_num's second positional argument is `copy`, not the
    # fill value, so `np.nan_to_num(norm_data, 0)` silently meant copy=False.
    # Pass the fill value by keyword so NaN -> 0 is explicit.
    norm_data_x = np.nan_to_num(norm_data, nan=0.0)

    ## GAIN architecture
    tf1.reset_default_graph()

    # Input placeholders
    X = tf1.placeholder(tf.float32, shape=[None, dim])  # data vector
    M = tf1.placeholder(tf.float32, shape=[None, dim])  # mask vector
    B = tf1.placeholder(tf.float32, shape=[None, dim])  # hint-selection vector

    # Discriminator variables (Data + Hint as inputs)
    D_W1 = tf1.Variable(xavier_init([dim * 2, h_dim]))
    D_b1 = tf1.Variable(tf.zeros(shape=[h_dim]))
    D_W2 = tf1.Variable(xavier_init([h_dim, h_dim]))
    D_b2 = tf1.Variable(tf.zeros(shape=[h_dim]))
    D_W3 = tf1.Variable(xavier_init([h_dim, dim]))
    D_b3 = tf1.Variable(tf.zeros(shape=[dim]))  # multi-variate outputs
    theta_D = [D_W1, D_W2, D_W3, D_b1, D_b2, D_b3]

    # Generator variables (Data + Mask as inputs; noise fills missing slots).
    # Consistency fix: use tf1.Variable like the discriminator above — the
    # training loop relies on the v1 Variable.load() API for these.
    G_W1 = tf1.Variable(xavier_init([dim * 2, h_dim]))
    G_b1 = tf1.Variable(tf.zeros(shape=[h_dim]))
    G_W2 = tf1.Variable(xavier_init([h_dim, h_dim]))
    G_b2 = tf1.Variable(tf.zeros(shape=[h_dim]))
    G_W3 = tf1.Variable(xavier_init([h_dim, dim]))
    G_b3 = tf1.Variable(tf.zeros(shape=[dim]))
    theta_G = [G_W1, G_W2, G_W3, G_b1, G_b2, G_b3]

    ## GAIN functions
    # Generator
    def generator(x, m):
        # Concatenate Mask and Data
        inputs = tf.concat(values=[x, m], axis=1)
        G_h1 = tf.nn.relu(tf.matmul(inputs, G_W1) + G_b1)
        G_h2 = tf.nn.relu(tf.matmul(G_h1, G_W2) + G_b2)
        # MinMax normalized output
        G_prob = tf.nn.sigmoid(tf.matmul(G_h2, G_W3) + G_b3)
        return G_prob

    # Discriminator
    def discriminator(x, h):
        # Concatenate Data and Hint
        inputs = tf.concat(values=[x, h], axis=1)
        D_h1 = tf.nn.relu(tf.matmul(inputs, D_W1) + D_b1)
        D_h2 = tf.nn.relu(tf.matmul(D_h1, D_W2) + D_b2)
        D_logit = tf.matmul(D_h2, D_W3) + D_b3
        D_prob = tf.nn.sigmoid(D_logit)
        return D_prob

    ## GAIN structure
    G_sample = generator(X, M)
    # Hint: reveal the true mask where B == 1, uninformative 0.5 elsewhere.
    H = B * M + 0.5 * (1 - B)
    # Discriminator output used for the generator's loss (live G_sample).
    D_prob_g = discriminator(X * M + G_sample * (1 - M), H)

    # Discriminator is trained against pre-computed fake samples fed here.
    fake_X = tf1.placeholder(tf.float32, shape=[None, dim])
    Hat_X = X * M + fake_X * (1 - M)
    D_prob = discriminator(Hat_X, H)

    # GAIN loss
    D_loss_temp = -tf.reduce_mean((M * tf1.log(D_prob + 1e-8) \
                                   + (1 - M) * tf1.log(1. - D_prob + 1e-8)))
    G_loss_temp = -tf.reduce_mean((1 - M) * tf1.log(D_prob_g + 1e-8))
    MSE_loss = tf.reduce_mean((M * X - M * G_sample)**2) / tf.reduce_mean(M)
    D_loss = D_loss_temp
    G_loss = G_loss_temp + alpha * MSE_loss

    ## GAIN solver
    D_solver = tf1.train.AdamOptimizer().minimize(D_loss, var_list=theta_D)
    G_solver = tf1.train.AdamOptimizer().minimize(G_loss, var_list=theta_G)

    ## Iterations
    sess = tf1.Session()
    sess.run(tf1.global_variables_initializer())

    # Snapshot the freshly initialized generator weights.
    gen_new_params = []
    params = [sess.run(param) for param in theta_G]
    gen_new_params.append(params)

    for it in range(iterations):
        # Weights as they were before this iteration's discriminator step;
        # the generator update below restarts from this snapshot.
        gen_old_params = copy.deepcopy(gen_new_params)

        # ---- Discriminator step ----
        batch_idx = sample_batch_index(no, batch_size)
        X_mb = norm_data_x[batch_idx, :]
        M_mb = m[batch_idx, :]
        Z_mb = uniform_sampler(0.0, 0.01, batch_size, dim)
        B_mb = sample_batch_binary(dim, batch_size)
        # Combine random vectors with observed vectors
        X_mb = M_mb * X_mb + (1 - M_mb) * Z_mb

        # Fake samples from the current generator, fed as constants to D.
        f_mb = sess.run([G_sample], feed_dict={X: X_mb, M: M_mb})[0]
        # Restore the latest generator snapshot before the D update.
        for w in range(len(theta_G)):
            theta_G[w].load(gen_new_params[0][w], sess)
        _, D_loss_curr = sess.run([D_solver, D_loss_temp],
                                  feed_dict={
                                      X: X_mb,
                                      M: M_mb,
                                      fake_X: f_mb,
                                      B: B_mb
                                  })

        # ---- Generator step (fresh batch) ----
        batch_idx = sample_batch_index(no, batch_size)
        X_mb = norm_data_x[batch_idx, :]
        M_mb = m[batch_idx, :]
        Z_mb = uniform_sampler(0.0, 0.01, batch_size, dim)
        B_mb = sample_batch_binary(dim, batch_size)
        # Combine random vectors with observed vectors
        X_mb = M_mb * X_mb + (1 - M_mb) * Z_mb

        # Start the generator update from the pre-iteration snapshot.
        for w in range(len(theta_G)):
            theta_G[w].load(gen_old_params[0][w], sess)
        _, G_loss_curr, MSE_loss_curr = \
            sess.run([G_solver, G_loss_temp, MSE_loss],
                     feed_dict={X: X_mb, M: M_mb, B: B_mb})

        # Save the updated generator weights for the next iteration.
        gen_new_params[0] = [sess.run(param) for param in theta_G]

    ## Return imputed data
    Z_mb = uniform_sampler(0.0, 0.01, no, dim)
    M_mb = m
    X_mb = norm_data_x
    X_mb = M_mb * X_mb + (1 - M_mb) * Z_mb
    for w in range(len(theta_G)):
        theta_G[w].load(gen_new_params[0][w], sess)
    imputed_data = sess.run([G_sample], feed_dict={X: X_mb, M: M_mb})[0]
    sess.close()

    # Keep observed values; use generator output only where data was missing.
    imputed_data = m * norm_data_x + (1 - m) * imputed_data

    # Renormalization
    imputed_data = renormalization(imputed_data, norm_parameters)

    # Rounding
    imputed_data = rounding(imputed_data, miss_data_x)

    return imputed_data