-
Notifications
You must be signed in to change notification settings - Fork 0
/
simulated_data_binarycause.py
174 lines (154 loc) · 6.66 KB
/
simulated_data_binarycause.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
import logging
import numpy as np
import numpy.random as npr
import pandas as pd
import scipy.stats
from scipy import sparse, stats
from scipy.special import expit
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
logger = logging.getLogger(__name__)
class gwas_simulated_data(object):
    """Simulate a GWAS-like dataset of binary SNPs with known true causes.

    The first ``true_causes`` columns of the simulated SNP matrix receive a
    non-zero coefficient; the remaining ``confounders`` columns receive 0.

    Reference:
    https://github.com/raquelaoki/ParKCa/blob/master/src/datapreprocessing.py
    """

    def __init__(self, n_units=10000, n_causes=100, seed=4,
                 pca_path='data/tgp_pca2.txt', prop_tc=0.1,
                 true_causes=None, unit_test=False):
        """Store the simulation settings and load the PCA matrix from disk.

        Args:
            n_units: number of simulated samples (rows).
            n_causes: number of simulated SNP columns.
            seed: value passed to ``np.random.seed`` before each simulation step.
            pca_path: comma-delimited text file read with ``np.loadtxt``.
            prop_tc: fraction of ``n_causes`` that are true causes; only used
                when ``true_causes`` is None (at least 1 cause is kept).
            true_causes: explicit number of true causes; overrides ``prop_tc``.
            unit_test: when truthy, configures root logging at DEBUG level.

        Raises:
            OSError: if neither ``pca_path`` nor the fallback location can be read.
        """
        self.n_units = n_units
        self.n_causes = n_causes
        if true_causes is None:
            self.true_causes = np.max([1, int(n_causes * prop_tc)])
        else:
            self.true_causes = true_causes
        self.confounders = self.n_causes - self.true_causes
        self.seed = seed
        self.pca_path = pca_path
        try:
            self.S = np.loadtxt(self.pca_path, delimiter=',')
        except OSError:
            # Only a missing/unreadable file triggers the fallback; parse errors
            # and interrupts now propagate instead of being silently swallowed.
            # NOTE(review): the prefix is concatenated without a separator, so
            # this only resolves when pca_path starts with '/' - confirm intent.
            self.S = np.loadtxt('M3E2/CompBioAndSimulated_Datasets' + self.pca_path, delimiter=',')
        self.prop_tc = prop_tc
        self.unit_test = unit_test
        if self.unit_test:
            logging.basicConfig(level=logging.DEBUG)
        logger.debug('Dataset - GWAS initialized!')

    def generate_samples(self, prop=(0.2, 0.2, 0.05)):
        """Simulate the SNP matrix and the outcome in one call.

        Args:
            prop: three scaling factors forwarded to ``sim_genes_TGP``.

        Returns:
            Tuple ``(x, y, t, tau)``:
            x: DataFrame of the columns that are not true causes.
            y: continuous outcome reshaped to ``(n_units, -1)``.
            t: flattened values of the true-cause column(s).
            tau: coefficients drawn for the true causes.
        """
        G0, lambdas = self.sim_genes_TGP(D=3, prop=prop)
        x, y, t, tau = self.sim_dataset(G0, lambdas)
        y = y.reshape(self.n_units, -1)
        return x, y, t, tau

    def sim_genes_TGP(self, D, prop=(0.2, 0.2, 0.05)):
        """Simulate a binary SNP matrix from the loaded PCA matrix.

        Adapted from the Deconfounder authors' code.

        Args:
            D: not used by this implementation; kept for interface compatibility.
            prop: scales of the three columns of the loading matrix (two
                uniform-random columns and one constant column).

        Returns:
            Tuple ``(G, lambdas)``: ``G`` is the ``n_units x n_causes`` matrix of
            Bernoulli draws and ``lambdas`` the 3-cluster KMeans labels of the
            sampled PCA rows (the unobserved group of each unit).
        """
        np.random.seed(self.seed)
        S = expit(self.S)
        Gammamat = np.zeros((self.n_causes, 3))
        Gammamat[:, 0] = prop[0] * npr.uniform(size=self.n_causes)
        Gammamat[:, 1] = prop[1] * npr.uniform(size=self.n_causes)
        Gammamat[:, 2] = prop[2] * np.ones(self.n_causes)
        # Resample rows with replacement and append an intercept column.
        rows = npr.choice(S.shape[0], size=self.n_units, replace=True)
        S = np.column_stack((S[rows], np.ones(self.n_units)))
        # Per-unit, per-SNP success probabilities.
        # assumes self.S has two columns so that S matches Gammamat - TODO confirm
        F = S.dot(Gammamat.T)
        # One Bernoulli trial per entry (the source used 2 trials): SNPs are binary.
        G = npr.binomial(1, F)
        # Unobserved group membership.
        lambdas = KMeans(n_clusters=3, random_state=123).fit(S).labels_
        return G, lambdas

    def sim_dataset(self, G0, lambdas):
        """Compute the outcome from the simulated SNPs and split off the treatment.

        Args:
            G0: ``n_units x n_causes`` array of simulated SNPs.
            lambdas: array of group labels (0, 1 or 2), one per unit.

        Returns:
            Tuple ``(G, y, treatment, tc_)``:
            G: DataFrame of ``G0`` without the true-cause columns.
            y: continuous outcome ``y0 + y1 + y2`` with shape ``(1, n_units)``.
            treatment: flattened values of the true-cause column(s).
            tc_: coefficients drawn for the true causes.
        """
        np.random.seed(self.seed)
        tc_ = npr.normal(loc=0, scale=0.5 * 0.5, size=self.true_causes)
        # True causes come first; every other column has a zero coefficient.
        tc = np.hstack((tc_, np.repeat(0.0, self.confounders)))

        # One noise scale per unobserved group; labels outside 0..2 keep scale 0.
        tau = stats.invgamma(3, 1).rvs(3, random_state=99)
        lambdas = np.asarray(lambdas)
        sigma = np.zeros(self.n_units)
        for group in range(3):
            sigma[lambdas == group] = tau[group]

        # y0: causal signal, y1: group effect, y2: noise; y1 and y2 are rescaled
        # relative to the spread of y0.
        y0 = np.array(tc).reshape(1, -1).dot(np.transpose(G0))
        l1 = lambdas.reshape(1, -1)
        y0_scale = np.sqrt(np.var(y0)) / np.sqrt(0.4)
        y1 = y0_scale * (np.sqrt(0.4) / np.sqrt(np.var(l1))) * l1
        e = npr.normal(0, sigma, self.n_units).reshape(1, -1)
        y2 = y0_scale * (np.sqrt(0.2) / np.sqrt(np.var(e))) * e
        y = y0 + y1 + y2

        # Binary version of the outcome; it is only reported in the debug log.
        p = 1 / (1 + np.exp(y))
        y01 = np.asarray([npr.binomial(1, p_i, 1)[0] for p_i in p[0]])

        G, col = self.add_colnames(G0, tc)
        treatment = G.iloc[:, col].values.reshape(-1)
        G.drop(G.columns[col].values, axis=1, inplace=True)

        # G no longer contains the treatment column(s), so its width is already
        # the number of covariates.
        logger.debug('... Covariates: %i', G.shape[1])
        logger.debug('... Target (y) : %f', np.sum(y01) / len(y01))
        logger.debug('... Sample Size: %i', G.shape[0])
        if len(col) == 1:
            # Use the extracted treatment: indexing G here would read a
            # covariate (or fail when no covariate column is left).
            logger.debug('... Proportion of T: %f', np.mean(treatment))
        logger.debug('Dataset - GWAS Done!')
        return G, y, treatment, tc_

    def add_colnames(self, data, truecauses):
        """Convert a matrix to a DataFrame with causal/noncausal column names.

        Args:
            data: 2-D array-like with one column per entry of ``truecauses``.
            truecauses: per-column coefficients; non-zero marks a true cause.

        Returns:
            Tuple ``(frame, columns)``: the named DataFrame and the positional
            indices of the true-cause columns.
        """
        colnames = []
        columns = []
        causes = 0
        noncauses = 0
        for position, coefficient in enumerate(truecauses):
            if coefficient != 0:
                colnames.append('causal_' + str(causes))
                causes += 1
                columns.append(position)
            else:
                colnames.append('noncausal_' + str(noncauses))
                noncauses += 1
        data = pd.DataFrame(data)
        data.columns = colnames
        return data, columns
class ihdp_data(object):
    """Load one IHDP CSV file and expose it as covariates/outcome/treatment.

    Source code: https://github.com/AMLab-Amsterdam/CEVAE.git
    """

    def __init__(self, id=1, path='/content/CEVAE/datasets/IHDP/'):
        """Read ``<path>ihdp_npci_<id>.csv`` (no header row) and name its columns.

        Args:
            id: suffix of the CSV file to load.
            path: directory prefix, concatenated as-is with the file name.
        """
        csv_file = path + 'ihdp_npci_' + str(id) + '.csv'
        frame = pd.read_csv(csv_file, sep=',', header=None)
        # Five outcome-related columns followed by covariates x1..x25.
        leading = ['treatment', 'y_factual', 'y_cfactual', 'mu0', 'mu1']
        covariates = ['x' + str(k) for k in range(1, 26)]
        frame.columns = leading + covariates
        self.data = frame
        logger.debug('Dataset - IHDP initialized!')

    def generate_samples(self):
        """Split the loaded table into model inputs.

        Returns:
            Tuple ``(X, y, t, tau)``:
            X: DataFrame without the outcome columns and without ``treatment``.
            y: array of ``y_factual`` values.
            t: 1-D array taken from the first remaining column (``treatment``).
            tau: mean of ``mu1`` minus mean of ``mu0``.
        """
        table = self.data
        tau = table['mu1'].mean() - table['mu0'].mean()
        y = table['y_factual'].values
        remaining = table.drop(['y_factual', 'y_cfactual', 'mu0', 'mu1'], axis=1)
        # The treatment is read by position (first column), then removed from X.
        t = remaining.iloc[:, [0]].values.ravel()
        X = remaining.drop('treatment', axis=1)
        logger.debug('Dataset - IHDP Done!')
        return X, y, t, tau