import numpy as np
import math
import itertools
import sys
import time
import os
import random
import gc  # needed by force_gpu_deallocate below
import h5py
import logging
import pandas as pd
import csv
mydir = os.path.dirname(__file__)
sys.path.append(os.path.join(mydir, '.'))
from cudamat import gnumpy as gp
class Unbuffered:
    def __init__(self, stream):
        self.stream = stream

    def write(self, data):
        self.stream.write(data)
        self.stream.flush()

    def __getattr__(self, attr):
        return getattr(self.stream, attr)

sys.stdout = Unbuffered(sys.stdout)  # for logging into file to work
# optionally wait for a free GPU board
_waitGpu = os.environ.get('DEEPLINK_WAIT_GPU', 'no')
if(_waitGpu == 'yes'):
    locked_gpu = False
    total_wait_seconds = 0
    while(not locked_gpu):
        try:
            gp.garray(np.zeros(1))  # a tiny allocation succeeds only if a board is free
            locked_gpu = True
            print 'GPU board is available after waiting %d seconds' % total_wait_seconds
        except:
            locked_gpu = False
            if(total_wait_seconds == 0): print 'No GPU board is available, waiting...'
            seconds = 600 + random.randint(-500, 500)
            time.sleep(seconds)  # sleep roughly ten minutes (with random jitter) before re-checking
            total_wait_seconds += seconds
# create logger
logger = logging.getLogger('deeplink_logger')
logger.setLevel(logging.DEBUG)
# create console handler and set level to debug
ch = logging.StreamHandler()
ch.setLevel(logging.DEBUG)
# create formatter
formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
# add formatter to ch
ch.setFormatter(formatter)
# add ch to logger
logger.addHandler(ch)
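
# Example: the module-level logger can be used from importing code, e.g.
#   from common import logger
#   logger.info('training started')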
def get_batches(data, batch_size):
    '''
    Compute per-batch sizes for splitting data into minibatches.
    data      : 2D array, one row per example
    batch_size: an integer, size of each training batch
    Returns (bsize, batch_sizes) where bsize is the stride between batch
    starts and batch_sizes[i] is the size of batch i (the last batch may
    be smaller).
    '''
    m = data.shape[0]
    num_batches = int(math.ceil(1.0 * m / batch_size))
    bsize = batch_size  # int(m / num_batches)
    batch_sizes = np.zeros(num_batches, 'i')
    for i in range(num_batches):
        i1 = i * bsize
        i2 = min(m, i1 + bsize)
        batch_sizes[i] = i2 - i1
        # print "%d" % (batch_sizes[i])
    # print 'There are ' + str(num_batches) + ' batches'
    return bsize, batch_sizes
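
# Worked example: for 10 rows and batch_size=4,
#   bsize == 4 and batch_sizes == [4, 4, 2]
# i.e. ceil(10 / 4) == 3 batches, with the remainder in the last one.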
def garrayify(arrays):
    return [ar if isinstance(ar, gp.garray) else gp.garray(ar) for ar in arrays]

def numpyify(arrays):
    return [ar if isinstance(ar, np.ndarray) else ar.as_numpy_array(dtype=np.float32) for ar in arrays]
def columnRMS(W):
    return gp.sqrt(gp.mean(W * W, axis=0))

def limitColumnRMS(W, rmsLim):
    """
    All columns of W with rms entry above the limit are scaled to equal the limit.
    The limit can either be a row vector or a scalar.
    """
    rmsScale = rmsLim / columnRMS(W)
    # equivalent to scaling each column by min(rmsScale, 1): the mask
    # (rmsScale < 1) leaves columns already under the limit untouched
    return W * (1 + (rmsScale < 1) * (rmsScale - 1))
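
# Worked example: if one column of W has rms 2.0 and rmsLim is 1.0, then
# rmsScale = 0.5 for that column and the whole column is halved; a column
# with rms 0.5 has rmsScale = 2.0 > 1 and is left unchanged.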
def bernoulli(gpu_hidden):
    # sample a binary matrix whose entries are 1 with probability gpu_hidden
    return gpu_hidden.rand() < gpu_hidden
def force_gpu_deallocate(a):
    # reach into gnumpy internals to return a garray's memory to the board
    # immediately, instead of leaving it in gnumpy's reuse pool
    if(not isinstance(a, gp.garray)): return
    try:
        thesize = a.shape[0] * a.shape[1]
        del a  # delete variable
        gp._cmsForReuse[thesize].pop()
        gp.__memoryInUse -= thesize * 4
        del gc.garbage[:]
    except IndexError:
        # Can't deallocate if it doesn't exist, can ya?
        pass
class gpu_buffer(object):
    '''
    A gpu memory buffer manager, which makes efficient usage of gpu memory:
    1. reuse a previously allocated gpu buffer whenever possible
    2. free gpu memory as soon as it is no longer being used
    3. reduce cpu <-> gpu IO as much as possible
    '''
    _gpu_buffer = None  # static class-level data member
    _gpu_label_buffer = None
    _gpu_buffer_size = 0  # number of float32

    def __init__(self, data, label=None, batch_size=128, randomize=False, gpu_buffer_size_MB=512):
        nsize = int(gpu_buffer_size_MB * 1024 * 1024 / 4)  # a float32 takes 4 bytes
        self.data = data
        self.label = label
        self.batch_size = batch_size
        self.randomize = randomize
        columns = self.data.shape[1] if self.label is None else 1 + self.data.shape[1]
        self.num_rows = int(nsize / columns)  # maximum number of rows the buffer can hold
        # align with the minibatch size
        self.num_rows = int(self.num_rows / self.batch_size) * self.batch_size
        # print "number of rows to be allocated in gpu: %d" % self.num_rows
        # allocate gpu memory only once, reuse afterwards if possible
        if(gpu_buffer._gpu_buffer_size < self.num_rows * self.data.shape[1]):
            # requesting a bigger buffer, has to re-allocate and resize
            print "requesting buffer with size of %d float32" % (self.num_rows * self.data.shape[1])
            # if(gpu_buffer._gpu_buffer is not None): gpu_buffer._gpu_buffer._free_device_memory()
            gpu_buffer._gpu_buffer = gp.zeros((self.num_rows, self.data.shape[1]))
            gpu_buffer._gpu_buffer_size = self.num_rows * self.data.shape[1]
            # if(gpu_buffer._gpu_label_buffer is not None): gpu_buffer._gpu_label_buffer._free_device_memory()
            if self.label is not None:
                gpu_buffer._gpu_label_buffer = gp.zeros((self.num_rows, self.label.shape[1]))

    def _gdata(self, data):
        return gpu_buffer._gpu_buffer._overwrite(data)

    def _glabel(self, label):
        # return gp.garray(label) if label is not None else None
        return gpu_buffer._gpu_label_buffer._overwrite(label) if label is not None else None

    def _iter_random_buffer(self):
        while True:
            idx = np.random.randint(self.data.shape[0], size=(self.num_rows,))
            gp_label_buffer = None if self.label is None else self._glabel(self.label[idx])
            yield self._gdata(self.data[idx]), gp_label_buffer

    def _iter_all_buffer(self):
        num_buffers = int(self.data.shape[0] / self.num_rows) + 1  # +1 to handle a possible remainder smaller than the buffer size
        (start, end) = (0, 0)
        for i in range(num_buffers):
            (start, end) = (i * self.num_rows, (i + 1) * self.num_rows)
            if(end > self.data.shape[0]): end = self.data.shape[0]
            if(start < self.data.shape[0]):
                gp_label_buffer = None if self.label is None else self._glabel(self.label[start:end])
                yield self._gdata(self.data[start:end]), gp_label_buffer

    def iter_minibatch(self):
        buffer_iter = self._iter_random_buffer() if self.randomize else self._iter_all_buffer()
        for data_buffer, label_buffer in buffer_iter:
            num_batches_per_buffer = int(data_buffer.shape[0] / self.batch_size) + 1  # +1 to handle the remainder
            for i in range(num_batches_per_buffer):
                (start, end) = (i * self.batch_size, (i + 1) * self.batch_size)
                if(end > data_buffer.shape[0]): end = data_buffer.shape[0]
                if(start < data_buffer.shape[0]):
                    gp_label_batch = None if label_buffer is None else label_buffer[start : end]
                    yield data_buffer[start : end], gp_label_batch
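
# Usage sketch (shapes are hypothetical; assumes the cudamat/gnumpy build
# this repo bundles):
#   X = np.random.rand(100000, 784).astype(np.float32)
#   buf = gpu_buffer(X, batch_size=128, randomize=True, gpu_buffer_size_MB=256)
#   for batch, _ in buf.iter_minibatch():
#       pass  # batch is a gp.garray slice of shape (128, 784)
# Note that iter_minibatch() never terminates when randomize=True; the
# caller decides how many batches to consume.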
class cpu_buffer(object):
    ''' A cpu memory buffer manager, which makes it possible to use a data set larger than cpu memory:
    1. use an hdf5 file to reduce disk I/O as much as possible
    2. make efficient use of cpu memory and fast disk i/o with the hdf5 data file
    3. works well with an SSD disk (e.g. 256GB), as a replacement for expensive RAM
    '''
    def __init__(self, dataset, label=None, cpu_buffer_size_MB=4096):
        '''dataset is an h5py Dataset, resident on disk, which supports slicing into a numpy array (resident in cpu memory)'''
        self.dataset = dataset
        self.label = label
        nsize = int(cpu_buffer_size_MB * 1024 * 1024 / 4)  # a float32 takes 4 bytes
        self.num_rows = int(nsize / self.dataset.shape[1])  # maximum number of rows the buffer can hold in memory
        if self.dataset.shape[0] <= self.num_rows: self.dataset = self.dataset[...]  # load the whole dataset into memory if it is small enough
        self.num_buffers = int(self.dataset.shape[0] / self.num_rows) + 1  # +1 to handle a possible remainder smaller than the buffer size

    def iter_random_mem(self):
        mem_size = min(self.num_rows, self.dataset.shape[0])
        while True:
            idx = np.random.randint(self.dataset.shape[0], size=(mem_size,))
            u, indices = np.unique(idx, return_inverse=True)
            label = None if self.label is None else self.label[idx]
            sys.stdout.write(".")
            yield self.dataset[u, :][indices], label  # an h5py Dataset can only be sliced this way for sampling with replacement

    def iter_all_mem(self):
        (start, end) = (0, 0)
        for i in range(self.num_buffers):
            (start, end) = (i * self.num_rows, (i + 1) * self.num_rows)
            if(end > self.dataset.shape[0]): end = self.dataset.shape[0]
            if(start < self.dataset.shape[0]):
                label = None if self.label is None else self.label[start:end]
                sys.stdout.write(".")
                yield self.dataset[start:end], label
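
# Usage sketch (file name and dataset key are hypothetical):
#   f = h5py.File('train.h5', 'r')
#   buf = cpu_buffer(f['data'], cpu_buffer_size_MB=2048)
#   for mem_chunk, _ in buf.iter_all_mem():
#       pass  # mem_chunk is a numpy array of at most buf.num_rows rows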
class cpu_gpu_buffer(cpu_buffer):
    '''
    A combined cpu/gpu memory buffer manager, which makes efficient usage of both cpu and gpu memory
    '''
    def __init__(self, data, label=None, batch_size=128, randomize=False, gpu_buffer_size_MB=512, cpu_buffer_size_MB=4096, inverse_disk_io_rate=1):
        '''inverse_disk_io_rate > 1 controls how much less frequently we do disk io, at the cost of reusing a smaller set of data in memory;
        inverse_disk_io_rate = 1 means we always want fresh data once all data in memory has been consumed'''
        if(not isinstance(data, h5py.Dataset)):
            raise ValueError("Input data is not of h5py.Dataset type")
        num_gpu_buffers = int(cpu_buffer_size_MB / gpu_buffer_size_MB)
        cpu_buffer_size_MB = num_gpu_buffers * gpu_buffer_size_MB  # enforce that cpu_buffer_size_MB is divisible by gpu_buffer_size_MB
        super(cpu_gpu_buffer, self).__init__(data, label, cpu_buffer_size_MB)
        self.cpu_mem_iter = self.iter_random_mem() if randomize else self.iter_all_mem()
        self.batch_size = batch_size
        self.randomize = randomize
        self.gpu_buffer_size_MB = gpu_buffer_size_MB
        nsize = int(gpu_buffer_size_MB * 1024 * 1024 / 4)  # a float32 takes 4 bytes
        num_gpu_rows = int(nsize / data.shape[1])
        self.num_batches = inverse_disk_io_rate * num_gpu_buffers * int(num_gpu_rows / batch_size)

    def iter_minibatch(self):
        for cpu_mem, cpu_label in self.cpu_mem_iter:
            minibatch = gpu_buffer(cpu_mem, cpu_label, self.batch_size, self.randomize, self.gpu_buffer_size_MB).iter_minibatch()
            for n, (data, label) in enumerate(minibatch):
                yield data, label
                if(n > self.num_batches): break  # time to load a new piece of cpu_mem from disk
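
# Usage sketch (file name and key are hypothetical; this chains the two
# buffers: hdf5 on disk -> cpu chunks -> gpu chunks -> minibatches):
#   f = h5py.File('train.h5', 'r')
#   buf = cpu_gpu_buffer(f['data'], batch_size=128, randomize=True)
#   it = buf.iter_minibatch()
#   for _ in range(1000):
#       batch, _ = next(it)  # consume 1000 training minibatches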
# may use this for prediction tasks, which are fast on cpu anyway
def iter_minibatch(data, batch_size):
    bsize, batch_sizes = get_batches(data, batch_size)
    num_batches = batch_sizes.shape[0]
    for i in range(num_batches):
        (start, end) = (i * bsize, i * bsize + batch_sizes[i])
        yield data[start : end], (start, end)
def iter_minibatch_tuple(data, labels, batch_size):
    bsize, batch_sizes = get_batches(data, batch_size)
    num_batches = batch_sizes.shape[0]
    for i in range(num_batches):
        (start, end) = (i * bsize, i * bsize + batch_sizes[i])
        yield data[start : end], labels[start : end], (start, end)

def sampleMinibatch(mbsz, inps, targs):
    idx = np.random.randint(inps.shape[0], size=(mbsz,))
    return inps[idx], targs[idx]
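
# Usage sketch (shapes are hypothetical):
#   X, Y = np.random.rand(1000, 10), np.random.rand(1000, 1)
#   for xb, yb, (s, e) in iter_minibatch_tuple(X, Y, 128):
#       pass  # xb is X[s:e], yb is Y[s:e]
#   xb, yb = sampleMinibatch(128, X, Y)  # one random minibatch, sampled with replacement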
'''
Two of the most useful ways to standardize inputs are:
o Mean 0 and standard deviation 1
o Midrange 0 and range 2 (i.e., minimum -1 and maximum 1)
http://www.faqs.org/faqs/ai-faq/neural-nets/part2/
'''
def normalize(data, mu=None, sigma=None):
    '''
    data normalization function: standardizes to mean 0 and standard deviation 1
    data : 2D array, each row is one data point
    mu   : 1D array, each element is the mean of the corresponding column in data
    sigma: 1D array, each element is the standard deviation of the corresponding
           column in data
    '''
    (m, n) = data.shape
    if mu is None or sigma is None:
        mu = np.mean(data, 0)
        sigma = np.std(data, 0)
    mu_rep = np.tile(mu, (m, 1))
    sigma_rep = np.tile(sigma, (m, 1))
    return (data - mu_rep) / sigma_rep, mu, sigma
def un_normalize(data, mu, sigma):
    '''
    un-normalize the normalized data. This is used for visualization purposes
    data : 2D array, each row is one data point
    mu   : 1D array, each element is the mean of the corresponding column in data
    sigma: 1D array, each element is the standard deviation of the corresponding
           column in data
    '''
    (m, n) = data.shape
    mu_rep = np.tile(mu, (m, 1))
    sigma_rep = np.tile(sigma, (m, 1))
    return np.multiply(data, sigma_rep) + mu_rep
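
# Worked example: a round trip recovers the original data,
#   X = np.array([[1., 2.], [3., 6.]])
#   Z, mu, sigma = normalize(X)        # mu == [2., 4.], sigma == [1., 2.]
#   np.allclose(un_normalize(Z, mu, sigma), X)  # -> True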
'''
standardize the target variables, usually use normalize_minus_one or normalize_zero_one as below
'''
def normalize_zero_one(data, min=None, gap=None):
    '''
    normalize data to range [0,1]; also for normalizing targets when the activation function is sigmoid()
    data: 2D array, each row is one data point
    min : 1D array, each element is the min of the corresponding column in data
    gap : 1D array, each element is the max-min of the corresponding column in data
    '''
    (m, n) = data.shape
    if min is None or gap is None:
        min = np.min(data, 0)
        max = np.max(data, 0)
        gap = max - min
    min_rep = np.tile(min, (m, 1))
    gap_rep = np.tile(gap, (m, 1))
    return (data - min_rep) / gap_rep, min, gap
def un_normalize_zero_one(data, min, gap):
    '''
    un-normalize the normalized data. This is used for visualization purposes
    data: 2D array, each row is one data point
    min : 1D array, each element is the min of the corresponding column in data
    gap : 1D array, each element is the max-min of the corresponding column in data
    '''
    (m, n) = data.shape
    min_rep = np.tile(min, (m, 1))
    gap_rep = np.tile(gap, (m, 1))
    return np.multiply(data, gap_rep) + min_rep
def normalize_minus_one(data, min=None, gap=None):
    '''
    normalize data to range [-1,1]; also for normalizing targets when the activation function is tanh()
    data: 2D array, each row is one data point
    min : 1D array, each element is the min of the corresponding column in data
    gap : 1D array, each element is the max-min of the corresponding column in data
    '''
    zero_one, min, gap = normalize_zero_one(data, min, gap)
    return 2. * zero_one - 1, min, gap
def un_normalize_minus_one(data, min, gap):
    '''
    un-normalize the normalized data. This is used for visualization purposes
    data: 2D array, each row is one data point
    min : 1D array, each element is the min of the corresponding column in data
    gap : 1D array, each element is the max-min of the corresponding column in data
    '''
    (m, n) = data.shape
    max = min + gap
    max_rep = np.tile(max, (m, 1))
    return 0.5 * (un_normalize_zero_one(data, min, gap) + max_rep)
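
# Why the formula above inverts normalize_minus_one: with y = 2*z - 1 and
# x = z*gap + min, we get z = (y + 1)/2, hence
#   x = (y + 1)/2 * gap + min
#     = 0.5 * ((y*gap + min) + (min + gap))
#     = 0.5 * (un_normalize_zero_one(y, min, gap) + max)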
def normalize_identity(data, min=None, gap=None):
    return data, min, gap

def un_normalize_identity(data, min=None, gap=None):
    return data
normalizer_name_map = {  # mapping of names to (normalize, un_normalize) pairs for data/label normalization
    "None": (normalize_identity, un_normalize_identity),
    "Gaussian": (normalize, un_normalize),  # usually for the data normalizer
    "Uniform_Minus_One": (normalize_minus_one, un_normalize_minus_one),  # for either the data or the target normalizer
    "Uniform_Zero_One": (normalize_zero_one, un_normalize_zero_one)  # for the target normalizer only
}

target_activation_normalizer_map = {  # output activation with its corresponding target normalizer
    "Sigmoid": (normalize_zero_one, un_normalize_zero_one),
    "Softmax": (normalize_identity, un_normalize_identity),
    "Tanh": (normalize_minus_one, un_normalize_minus_one),
    "LeTanh": (normalize_minus_one, un_normalize_minus_one),
    "Linear": (normalize_minus_one, un_normalize_minus_one)
}
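
# Usage sketch: pick a normalizer pair by name and round-trip the targets
# ('targets' is a hypothetical 2D array),
#   norm, un_norm = target_activation_normalizer_map["Tanh"]
#   t_scaled, t_min, t_gap = norm(targets)
#   t_back = un_norm(t_scaled, t_min, t_gap)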
def flattenFeatures(features):
    # concatenate a list of 2D feature blocks column-wise (equivalent to np.hstack)
    a = np.array(features[0])
    n = len(features)
    for i in range(1, n):
        a = np.append(a, features[i], axis=1)
    return a
def num_mistakes(targetsMB, outputs):
    return (outputs.argmax(axis=1) != targetsMB.argmax(axis=1)).sum()

def sum_absolute_error(targets, outputs):
    if not isinstance(outputs, np.ndarray):
        outputs = outputs.as_numpy_array()
    if not isinstance(targets, np.ndarray):
        targets = targets.as_numpy_array()
    return np.abs(outputs - targets).sum()

def mean_absolute_error(targets, outputs):
    return sum_absolute_error(targets, outputs) / targets.shape[0]

def sum_square_error(targets, outputs):
    # expects gp.garray inputs: euclid_norm() is a gnumpy method
    return (outputs - targets).euclid_norm() ** 2

def mean_square_error(targets, outputs):
    return sum_square_error(targets, outputs) / targets.shape[0]

metric_map = {
    "None": None,
    "sum_absolute_error": sum_absolute_error,
    "sum_square_error": sum_square_error,
    "num_mistakes": num_mistakes
}
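
# Usage sketch (num_mistakes assumes one-hot targets):
#   metric = metric_map["sum_absolute_error"]
#   err = metric(np.zeros((4, 3)), np.ones((4, 3)))  # -> 12.0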
# mae = 0.0
# for i in range(targets.shape[0]):
# mae += mae*i/(i+1) + 1.0 * math.fabs(outputs[i]-targets[i])/(i+1)
# return mae
def load_tsv(filename, delimiter='\t'):
    return pd.read_csv(filename, sep=delimiter, error_bad_lines=False)

def save_tsv(filename, data, names=None, delimiter='\t'):
    with open(filename, "w") as f:  # close the file handle when done
        writer = csv.writer(f, lineterminator="\n", delimiter=delimiter)
        if(names is not None):
            writer.writerow(names)
        writer.writerows(data)
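
# Usage sketch (file name is hypothetical):
#   save_tsv('scores.tsv', [[1, 0.5], [2, 0.75]], names=['id', 'score'])
#   df = load_tsv('scores.tsv')  # a pandas DataFrame with columns id, score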