/
C2ST_utils.py
246 lines (168 loc) · 8.27 KB
/
C2ST_utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
"""
This script contains various two-sample tests used in the experiments of 'Hypothesis Testing with Uncertain Curves'
We acknowledge the use of existing code from the repositories in https://github.com/wittawatj/ that served as a skeleton for the implementation of some of our tests, especially kernel-based tests. Thank you!
"""
from __future__ import print_function
from __future__ import division
from builtins import str
from builtins import range
from past.utils import old_div
from builtins import object
import autograd
import autograd.numpy as np
import tensorflow as tf
import scipy
import scipy.stats as stats
import math
from random import sample
from scipy.stats import norm as normal
from tensorflow.python.framework import ops
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import (RBF, Matern, RationalQuadratic, ExpSineSquared, WhiteKernel)
from sklearn.gaussian_process.kernels import ConstantKernel as C
from scipy.interpolate import interp1d
from scipy.stats import rankdata
from numpy.linalg import inv
# -------------------------------------------------------------------------------------------
# -------------------------------------------------------------------------------------------
def train_test_split(T1, X1, T2, X2, train_rate=0.8):
    """
    Build a shuffled, labelled dataset from two samples of trajectories and
    split it into train/test parts for the C2ST classifier.

    :param T1: (n1, d) array of observation times for sample 1
    :param X1: (n1, d) array of observations for sample 1
    :param T2: (n2, d) array of observation times for sample 2
    :param X2: (n2, d) array of observations for sample 2
    :param train_rate: fraction of the pooled data used for training
    :return: (trainX, trainY, testX, testY); the X arrays have shape
             (m, d, 2) with channel 0 = observation and channel 1 = time,
             the Y arrays have shape (m, 1) with label 0 for sample 1 and
             1 for sample 2.

    Note: the two samples may now contain different numbers of trajectories
    (the previous implementation iterated over ``range(len(X1))`` for both
    samples, silently leaving rows unfilled or raising IndexError when
    ``len(X2) != len(X1)``).
    """
    # Pair each observation with its time stamp: (n, d, 2) with
    # [..., 0] = observation, [..., 1] = time.  np.stack reproduces the
    # original per-row hstack of column vectors, vectorized.
    dataX1 = np.stack((X1, T1), axis=-1).astype(float)
    dataX2 = np.stack((X2, T2), axis=-1).astype(float)
    # Class labels: 0 for every trajectory of sample 1, 1 for sample 2.
    dataY1 = np.zeros((len(dataX1), 1), dtype=int)
    dataY2 = np.ones((len(dataX2), 1), dtype=int)
    # Pool the two samples.
    dataX = np.vstack((dataX1, dataX2))
    dataY = np.vstack((dataY1, dataY2))
    # Shuffle features and labels with the SAME permutation so they stay
    # aligned.  This mirrors the original two Permute(..., seed=1) calls,
    # which reseeded before each permutation and therefore produced the
    # identical index order for X and Y (and the same global-RNG end state).
    np.random.seed(1)
    idx = np.random.permutation(len(dataX))
    dataX, dataY = dataX[idx], dataY[idx]
    # %% Train / Test Division
    train_size = int(len(dataX) * train_rate)
    trainX, testX = dataX[:train_size], dataX[train_size:]
    trainY, testY = dataY[:train_size], dataY[train_size:]
    return trainX, trainY, testX, testY
def Permute(x, seed=1):
    """Return the rows of ``x`` in a deterministic, seed-controlled order.

    Seeds NumPy's global RNG with ``seed`` (side effect: the global RNG
    state is reset) and applies the resulting random permutation to the
    first axis of ``x``.
    """
    np.random.seed(seed)
    return x[np.random.permutation(len(x))]
# TODO make this a class
def C2ST(t1,y1,t2,y2,output='p_value',train_rate = 0.5):
    """
    Classifier two sample test following the procedure from (Lopez-Paz et al, 2017). We train an RNN that takes
    irregular time points and observations at these points to return a prediction of underlying sampling population
    Note: it requires an equal number of observations in each trajectory but observation times can be arbitrary

    :param t1: observation times for sample 1 (one row per trajectory)
    :param y1: observations for sample 1
    :param t2: observation times for sample 2
    :param y2: observations for sample 2
    :param output: 'p_value' to return the test p-value, 'stat' to return
                   the held-out classification accuracy.
                   NOTE(review): any other value falls through both ifs at
                   the bottom and returns None implicitly — confirm intended.
    :param train_rate: fraction of the pooled data used to train the classifier
    :return: p-value for the hypothesis that two samples were generated from
             the same underlying stochastic process (or the accuracy statistic)
    """
    # 3. Data Loading — pooled, shuffled, labelled train/test split
    trainX, trainY, testX, testY = train_test_split(t1,y1,t2,y2,train_rate)
    # %% Main Function
    # 1. Graph Initialization — clear any graph left by a previous call
    ops.reset_default_graph()
    # 2. Parameters
    seq_length = len(trainX[0, :, 0])  # NOTE(review): unused below
    input_size = len(trainX[0, 0, :])  # features per time step (obs + time = 2)
    target_size = len(trainY[0, :])    # label dimension (1)
    learning_rate = 0.01
    iterations = 500
    hidden_layer_size = 10
    batch_size = 64
    # 3. Weights and Bias for the GRU gates.
    # NOTE(review): the recurrent weights are initialized to ZERO, unlike the
    # attention/output weights below — confirm this is deliberate (gradients
    # through the gates can still break the symmetry, but training may be slow).
    Wr = tf.Variable(tf.zeros([input_size, hidden_layer_size]))
    Ur = tf.Variable(tf.zeros([hidden_layer_size, hidden_layer_size]))
    br = tf.Variable(tf.zeros([hidden_layer_size]))
    Wu = tf.Variable(tf.zeros([input_size, hidden_layer_size]))
    Uu = tf.Variable(tf.zeros([hidden_layer_size, hidden_layer_size]))
    bu = tf.Variable(tf.zeros([hidden_layer_size]))
    Wh = tf.Variable(tf.zeros([input_size, hidden_layer_size]))
    Uh = tf.Variable(tf.zeros([hidden_layer_size, hidden_layer_size]))
    bh = tf.Variable(tf.zeros([hidden_layer_size]))
    # Weights for Attention (small random init)
    Wa1 = tf.Variable(tf.random.truncated_normal([hidden_layer_size + input_size, hidden_layer_size], mean=0, stddev=.01))
    Wa2 = tf.Variable(tf.random.truncated_normal([hidden_layer_size, target_size], mean=0, stddev=.01))
    ba1 = tf.Variable(tf.random.truncated_normal([hidden_layer_size], mean=0, stddev=.01))
    ba2 = tf.Variable(tf.random.truncated_normal([target_size], mean=0, stddev=.01))
    # Weights for output layers
    Wo = tf.Variable(tf.random.truncated_normal([hidden_layer_size, target_size], mean=0, stddev=.01))
    bo = tf.Variable(tf.random.truncated_normal([target_size], mean=0, stddev=.01))
    # 4. Place holder
    # Target labels, shape [batch, 1]
    Y = tf.compat.v1.placeholder(tf.float32, [None, 1])
    # Input vector with shape [batch, seq, embeddings]
    _inputs = tf.compat.v1.placeholder(tf.float32, shape=[None, None, input_size], name='inputs')
    # Function to convert batch input data to use scan ops of tensorflow.
    def process_batch_input_for_RNN(batch_input):
        # [batch, seq, feat] -> [feat, batch, seq] -> reversed -> [seq, batch, feat]
        # so tf.scan iterates over the time axis.
        batch_input_ = tf.transpose(batch_input, perm=[2, 0, 1])
        X = tf.transpose(batch_input_)
        return X
    # Processing inputs to work with scan function
    processed_input = process_batch_input_for_RNN(_inputs)
    # Initial Hidden States: a zero [batch, hidden] tensor whose batch
    # dimension is inherited from the (dynamic-batch) input via the matmul.
    initial_hidden = _inputs[:, 0, :]
    initial_hidden = tf.matmul(initial_hidden, tf.zeros([input_size, hidden_layer_size]))
    # 5. Function for Forward GRU cell.
    def GRU(previous_hidden_state, x):
        # R Gate (reset)
        r = tf.sigmoid(tf.matmul(x, Wr) + tf.matmul(previous_hidden_state, Ur) + br)
        # U Gate (update)
        u = tf.sigmoid(tf.matmul(x, Wu) + tf.matmul(previous_hidden_state, Uu) + bu)
        # Final Memory cell (candidate state, reset-gated previous state)
        c = tf.tanh(tf.matmul(x, Wh) + tf.matmul(tf.multiply(r, previous_hidden_state), Uh) + bh)
        # Current Hidden state: convex combination of previous state and candidate
        current_hidden_state = tf.multiply((1 - u), previous_hidden_state) + tf.multiply(u, c)
        return current_hidden_state
    # 6. Function to get the hidden and memory cells after forward pass
    def get_states():
        # Getting all hidden state through time, shape [seq, batch, hidden]
        all_hidden_states = tf.scan(GRU, processed_input, initializer=initial_hidden, name='states')
        return all_hidden_states
    # %% Attention
    # Function to get attention with the last input
    def get_attention(hidden_state):
        # Score each time step's hidden state jointly with the LAST time
        # step's raw input (processed_input[-1]).
        inputs = tf.concat((hidden_state, processed_input[-1]), axis=1)
        hidden_values = tf.nn.tanh(tf.matmul(inputs, Wa1) + ba1)
        e_values = (tf.matmul(hidden_values, Wa2) + ba2)
        return e_values
    # Function for getting output and attention coefficient
    def get_outputs():
        all_hidden_states = get_states()
        all_attention = tf.map_fn(get_attention, all_hidden_states)
        # Softmax over the TIME axis (axis=0 of [seq, batch, target])
        a_values = tf.nn.softmax(all_attention, axis=0)
        # Attention-weighted sum of hidden states over time -> [batch, target, hidden]
        final_hidden_state = tf.einsum('ijk,ijl->jkl', a_values, all_hidden_states)
        # Sigmoid output in (0, 1): predicted probability of class 1
        output = tf.nn.sigmoid(tf.matmul(final_hidden_state[:, 0, :], Wo) + bo)
        return output, a_values
    # Getting all outputs from rnn
    outputs, attention_values = get_outputs()
    # reshape out for sequence_loss — RMSE between predicted probability and 0/1 label
    loss = tf.sqrt(tf.reduce_mean(tf.square(outputs - Y)))
    # Optimization
    optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate)
    train = optimizer.minimize(loss)
    # Sessions
    sess = tf.compat.v1.Session()
    sess.run(tf.compat.v1.global_variables_initializer())
    # 3. Sample from the real data (Mini-batch index sampling, without replacement)
    def sample_X(m, n):
        return np.random.permutation(m)[:n]
    # Training step
    for i in range(iterations):
        idx = sample_X(len(trainX[:, 0, 0]), batch_size)
        Input = trainX[idx, :, :]
        _, step_loss = sess.run([train, loss], feed_dict={Y: trainY[idx], _inputs: Input})
        if i % 100 == 0:
            print("[step: {}] loss: {}".format(i, step_loss))
    # %% Evaluation on the held-out split
    final_outputs, final_attention_values = sess.run([outputs, attention_values], feed_dict={_inputs: testX})
    accuracy = np.mean(np.round(final_outputs)==testY)
    # One-sided test: under H0 the classifier accuracy is ~ N(1/2, 1/(4*n_test))
    # (Lopez-Paz et al., 2017); large accuracy -> small p-value.
    p_value = 1 - normal.cdf(accuracy,1/2,np.sqrt(1/(4*len(testX))))
    if output == 'stat':
        return accuracy
    if output == 'p_value':
        return p_value