# process_data.py
import numpy as np
from archipack import *
import os
import sys
# scikit-image is only needed by read_data(); keep the module importable
# without it so the pickle-based helpers still work.
try:
    from skimage import io, transform
except ImportError:
    # Only a missing/broken package is tolerated here; anything else
    # (KeyboardInterrupt, SystemExit, ...) must propagate.
    pass
def count_files(pattern="train", path='cancer_data/inputs'):
    """Count the directory entries whose name contains a substring.

    Args:
        pattern: Substring looked for in each entry name (a plain ``in``
            test, not a glob or a regular expression).
        path: Directory to list.

    Returns:
        The number of entries of ``path`` whose name contains ``pattern``.

    Raises:
        OSError: Propagated from ``os.listdir`` when ``path`` cannot be
            listed.
    """
    return sum(1 for name in os.listdir(path) if pattern in name)
# Read and transform
def read_data(batch_size=2000, batch_number=0, datatype="train", size=512):
    """Read one batch of input images together with their label masks.

    Entries of ``cancer_data/inputs`` whose name contains ``datatype`` are
    numbered from 0 in directory-listing order. Those whose number falls in
    the requested batch are loaded along with the same-named file from
    ``cancer_data/outputs``. Both images are resized to ``(size, size, 3)``;
    the input is normalised with ``whiten_data`` and only channel index 1 of
    the resized label image is kept.

    Batches wrap around the end of the file list: when the computed end
    index is smaller than the begin index, the batch consists of the tail
    ``[begin, total)`` plus the head ``[0, end)``.

    Side effect: if the file named ``pos_test_000072.png`` is part of the
    batch, its processed image and label are additionally pickled to
    ``testImageData<suffix>`` and ``testImageLabels<suffix>``, where the
    suffix is empty for ``size == 512`` and ``str(size)`` otherwise.

    Args:
        batch_size: Maximum number of files per batch (clamped to the number
            of matching files).
        batch_number: Zero-based index of the batch to read.
        datatype: Substring selecting which files belong to the data set.
        size: Edge length, in pixels, of the resized square images.

    Returns:
        A tuple ``(trd, trl)`` of two lists: the whitened input images and
        the corresponding single-channel labels. Both are empty when no file
        name contains ``datatype``.
    """
    input_dir = 'cancer_data/inputs'
    label_dir = 'cancer_data/outputs'

    # NOTE: os.listdir returns entries in arbitrary order, so which file ends
    # up in which batch is only stable while the listing order is.
    matching = [name for name in os.listdir(input_dir) if datatype in name]
    total = len(matching)

    trd, trl = [], []
    if total == 0:
        # Nothing to read; this also avoids the modulo-by-zero below.
        return trd, trl

    batch_size = min(batch_size, total)
    begin_batch = (batch_number * batch_size) % total
    end_batch = ((batch_number + 1) * batch_size) % total
    if end_batch == 0:
        # The batch ends exactly on the last file: use the un-wrapped end so
        # the half-open interval [begin, end) is not empty.
        end_batch = (batch_number + 1) * batch_size
    print(total, begin_batch, end_batch)

    wraps_around = begin_batch > end_batch
    s = "" if size == 512 else str(size)

    for count, f in enumerate(matching):
        if wraps_around:
            # Skip the gap between the head [0, end) and the tail [begin, total).
            if end_batch <= count < begin_batch:
                continue
        elif not (begin_batch <= count < end_batch):
            continue

        m = transform.resize(io.imread(os.path.join(input_dir, f)),
                             (size, size, 3), mode='constant')
        n = transform.resize(io.imread(os.path.join(label_dir, f)),
                             (size, size, 3), mode='constant')
        trd.append(whiten_data(m))
        # Don't whiten labels, only keep 1 layer
        trl.append(n[:, :, 1])

        if f == "pos_test_000072.png":
            # Keep a standalone copy of this one sample.
            pickle(trd[-1], "testImageData" + s)
            pickle(trl[-1], "testImageLabels" + s)

    return trd, trl
def whiten_data(I):
    """Shift an array to zero mean and scale it to unit standard deviation.

    Args:
        I: Array-like object providing ``.mean()`` and supporting arithmetic
            with scalars (e.g. a NumPy array).

    Returns:
        A new array ``(I - mean) / std``. When the standard deviation is
        exactly zero (constant input) the centred array is returned
        unscaled, instead of dividing by zero and producing NaN/inf.
    """
    centered = I - I.mean()
    spread = np.std(centered)
    if spread == 0:
        return centered
    return centered / spread
def whiten_data_list(inputList):
    """Normalise every array of a list to zero mean and unit std, in place.

    Args:
        inputList: Mutable sequence of arrays providing ``.mean()``. Each
            element is replaced by its normalised version.

    Returns:
        The same ``inputList`` object, after modification.

    An element whose standard deviation is exactly zero is only centred,
    not scaled, so constant arrays become all zeros rather than NaN/inf.
    """
    for idx, arr in enumerate(inputList):
        centered = arr - arr.mean()
        spread = np.std(centered)
        inputList[idx] = centered if spread == 0 else centered / spread
    return inputList
def pickle(myList, destination):
    """Serialise ``myList`` to the file ``destination`` with pickle protocol 2.

    The file is opened in binary write mode, so an existing file at
    ``destination`` is overwritten.
    """
    # Imported under an alias so the module does not rebind this function's
    # own name inside its body.
    import pickle as _pickle

    with open(destination, 'wb') as sink:
        _pickle.dump(myList, sink, protocol=2)
def unpickle(destination):
    """Load and return the object pickled in the file ``destination``.

    Args:
        destination: Path of a file containing a single pickled object.

    Returns:
        The unpickled object.

    Raises:
        OSError: If the file cannot be opened.
    """
    # SECURITY: pickle.load can execute arbitrary code; only use this on
    # files this project wrote itself, never on untrusted input.
    import pickle as _pickle

    with open(destination, 'rb') as f:
        return _pickle.load(f)
def preprocess_data(datatype='test', io_batch=2000, size=512):
    """Read, shuffle and pickle the test and/or train data set.

    For each selected split, batch 0 of up to ``io_batch`` files is read with
    ``read_data``, passed to ``shuffleDataAndLabelsInPlace`` and written with
    ``pickle`` to ``<split>_data<suffix>`` and ``<split>_labels<suffix>``.
    The suffix is empty for ``size == 512`` and ``str(size)`` otherwise.

    Args:
        datatype: ``'test'``, ``'train'`` or ``'both'``.
        io_batch: Batch size forwarded to ``read_data``.
        size: Image edge length forwarded to ``read_data``.

    Raises:
        ValueError: If ``datatype`` is not one of the accepted values
            (previously such a call silently did nothing).
    """
    splits_by_choice = {
        'test': ('test',),
        'train': ('train',),
        'both': ('test', 'train'),
    }
    try:
        splits = splits_by_choice[datatype]
    except KeyError:
        raise ValueError(
            "datatype must be 'test', 'train' or 'both', got %r" % (datatype,)
        )

    s = "" if size == 512 else str(size)
    for split in splits:
        data, labels = read_data(io_batch, 0, split, size=size)
        # shuffleDataAndLabelsInPlace is not defined in this file; presumably
        # it comes from the star import and shuffles both lists consistently
        # -- TODO confirm.
        shuffleDataAndLabelsInPlace(data, labels)
        pickle(data, split + "_data" + s)
        # Drop each list as soon as it is written to keep peak memory down.
        del data
        pickle(labels, split + "_labels" + s)
        del labels
def load_data(size = 512):
    """Load the four pickled data sets written for the given image size.

    File names are ``test_labels``, ``test_data``, ``train_data`` and
    ``train_labels``, each followed by ``str(size)`` unless ``size`` is 512.

    Returns:
        The tuple ``(test_labels, test_data, train_data, train_labels)``.
    """
    if size == 512:
        suffix = ""
    else:
        suffix = str(size)

    # Files are read in the same order as they are returned.
    names = ("test_labels", "test_data", "train_data", "train_labels")
    return tuple([unpickle(name + suffix) for name in names])
if __name__ == '__main__':
    # Script entry point: rebuild the pickled test and train sets at the
    # default 512-pixel size.
    #
    # Earlier experiments, kept for reference:
    #   read_data(100, 50, "train") / read_data(100, 50, "test")
    #   read_data(2000, 0, "test", size = 32)
    #   load_data(size = 32)
    preprocess_data('both', io_batch = 2000, size = 512)