-
Notifications
You must be signed in to change notification settings - Fork 0
/
training.py
96 lines (79 loc) · 3.22 KB
/
training.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
from utils import generate, read_pdb
from model import build_model
from keras import optimizers, losses, callbacks
import os
import psutil
def initializer():
    """Pool-worker initializer: load every protein/ligand PDB pair into
    process-global lists so parallel_generate can index them by pair id.

    Globals written: proteins, ligands — lists of [coordinates, atom_types],
    indexed 1..RANGE-1 (slot 0 is a placeholder so pair id i maps to index i).

    NOTE(review): reads module-level RANGE, which is defined under the
    __main__ guard — this only reaches worker processes via the 'fork'
    start method; under 'spawn' it would be undefined. Confirm on the
    target platform.
    """
    global proteins
    global ligands
    proteins = [[None, None]]
    ligands = [[None, None]]
    for i in range(1, RANGE):
        # File names are zero-padded to four digits (e.g. 0001_pro_cg.pdb);
        # a single {:04d} format spec replaces the original two-step
        # '%04d' % i fed into str.format.
        p_coordinates, p_atom_types = read_pdb("training_data/{:04d}_pro_cg.pdb".format(i))
        l_coordinates, l_atom_types = read_pdb("training_data/{:04d}_lig_cg.pdb".format(i))
        proteins.append([p_coordinates, p_atom_types])
        ligands.append([l_coordinates, l_atom_types])
    print("Loaded training dataset")
def parallel_generate(i):
    """Build training examples pairing protein i with every ligand.

    For each ligand j, grids are produced by utils.generate; the label is
    1 only for the true pair (i == j), otherwise 0. Reads the process-global
    proteins/ligands lists populated by initializer().

    Returns a (examples, targets) tuple of parallel lists.
    """
    examples = []
    targets = []
    for j in range(1, RANGE):
        grids = generate(ligands[j][0], ligands[j][1], proteins[i][0], proteins[i][1], RADIUS, DISTANCE_THRESHOLD)
        examples.extend(grids)
        # int(i == j) is 1 for the matching pair, 0 otherwise — one label
        # per generated grid.
        targets.extend([int(i == j)] * len(grids))
    return examples, targets
def generate_training_data_parallel():
    """Generate the full (data, labels) training set using all CPU cores.

    One pool task per protein id; each worker first loads all PDB pairs via
    initializer(), then parallel_generate pairs its protein with every ligand.

    Returns a (data, labels) tuple of flat parallel lists.
    """
    import multiprocessing
    from itertools import chain
    cores = multiprocessing.cpu_count()
    print("Generating training examples on {} CPU cores".format(cores))
    # Context manager guarantees the pool is torn down even if a worker
    # raises — the original leaked worker processes on error.
    with multiprocessing.Pool(cores, initializer) as pool:
        data, labels = zip(*pool.map(parallel_generate, range(1, RANGE)))
        pool.close()
        pool.join()
    # Flatten the per-task lists in C instead of a nested comprehension.
    data = list(chain.from_iterable(data))
    labels = list(chain.from_iterable(labels))
    return data, labels
def generate_training_data():
    """Sequential, single-process equivalent of generate_training_data_parallel.

    Loads every protein/ligand PDB pair, then emits grids for all i×j
    protein/ligand combinations, labelled 1 only when i == j.

    Returns a (data, labels) tuple of flat parallel lists.
    """
    # Slot 0 is a placeholder so pair id i maps directly to index i.
    proteins = [[None, None]]
    ligands = [[None, None]]
    for idx in range(1, RANGE):
        pro_coords, pro_types = read_pdb("training_data/{0}_pro_cg.pdb".format('%04d' % idx))
        lig_coords, lig_types = read_pdb("training_data/{0}_lig_cg.pdb".format('%04d' % idx))
        proteins.append([pro_coords, pro_types])
        ligands.append([lig_coords, lig_types])
    data = []
    labels = []
    for i in range(1, RANGE):
        for j in range(1, RANGE):
            grids = generate(ligands[j][0], ligands[j][1], proteins[i][0], proteins[i][1], RADIUS, DISTANCE_THRESHOLD)
            data.extend(grids)
            # One label per grid: 1 for the true pair, 0 for decoys.
            labels.extend([1 if i == j else 0] * len(grids))
    return data, labels
def training():
    """Build, train, and persist the 3D-CNN pairing model.

    Trains on grids from generate_training_data_parallel with early stopping
    and a 20% validation split, then saves architecture to model.json and
    weights to model.h5.
    """
    # Cubic grid side length; RADIUS cells each side of the centre cell.
    dimension = RADIUS * 2 + 1
    model = build_model(input_shape=(dimension, dimension, dimension, 3))
    # utils.plot_model(model, to_file='model.png')
    model.compile(optimizer=optimizers.Adam(),
                  loss=losses.binary_crossentropy,
                  metrics=["accuracy"])
    data, labels = generate_training_data_parallel()
    # data, labels = generate_training_data()
    print("Generated {} examples".format(len(data)))
    # memory usage
    rss = psutil.Process(os.getpid()).memory_info().rss
    print("Used total memory: {}".format(rss))
    print("Starting training")
    model.fit([data], [labels], validation_split=0.2, batch_size=100,
              epochs=5, callbacks=[callbacks.EarlyStopping()])
    print("Saving model")
    # serialize model to JSON
    with open("model.json", "w") as json_file:
        json_file.write(model.to_json())
    # serialize weights to HDF5
    model.save_weights("model.h5")
if __name__ == '__main__':
    # Hyper-parameters, read as module-level globals by the functions above.
    RANGE = 300  # pair ids run 1..RANGE-1 (299 protein/ligand pairs)
    RADIUS = 10  # grid half-width; cube side length is 2*RADIUS + 1 = 21
    DISTANCE_THRESHOLD = 10  # passed through to utils.generate; units not visible here
    training()