-
Notifications
You must be signed in to change notification settings - Fork 0
/
Construct.py
218 lines (192 loc) · 9.61 KB
/
Construct.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
import os
from Select import Select
from Common import Common
from tqdm import tqdm
import random
import multiprocessing
import Download
from functools import partial
class Construct:
    """Builds filtered / sampled copies of the Open Images CSV metadata files,
    optionally downloading the corresponding images.
    """

    def __init__(self, image_level=None):
        """ Builds the object

        Parameters
        ----------
        image_level : bool
            Whether using image-level dataset, set to False if using bounding boxes, by default is True
        """
        self.image_level = image_level
        self.common = Common(self.image_level)

    def build_images_csv(self, image_labels_file, image_ids_file, new_folder, root_dir, classes, boxes_file=None):
        """ Given an image labels file and image ids file, builds a new one containing only the specified classes in the new
        specified folder

        Parameters
        ----------
        image_labels_file : str
            Name of csv files with image labels, typically "XXX-annotations-human-imagelabels.csv"
        image_ids_file : str
            Name of csv files with image information, typically "XXX-images-with-labels-with-rotation.csv"
        boxes_file : str
            Optional (for use with bounding boxes), name of csv files with bounding boxes information, typically
            "XXX-annotations-bbox.csv"
        new_folder : str
            New folder to place new CSV's
        root_dir : str
            Root directory containing csv files and new folder
        classes : set of str
            Set of classes to be kept
        """
        image_labels_path = os.path.join(root_dir, image_labels_file)
        print("Selecting images to keep")
        images_to_keep = Select.select_images_with_class(image_labels_path, classes)
        to_process = [image_labels_file, image_ids_file]
        if boxes_file:
            to_process.append(boxes_file)
        for csv_file_name in to_process:
            print("Saving rows for {}".format(csv_file_name))
            old_path = os.path.join(root_dir, csv_file_name)
            new_path = os.path.join(root_dir, new_folder, csv_file_name)
            old_c = self.common.load_csv_as_dict(old_path)
            new_c = self.common.new_csv_as_dict(new_path, old_c.fieldnames)
            # Keep rows whose image survived selection and — when the file has a
            # LabelName column (the ids file does not) — whose label is wanted.
            rows = [row for row in tqdm(old_c)
                    if row['ImageID'] in images_to_keep
                    and (row.get('LabelName') is None or row.get('LabelName') in classes)]
            new_c.writeheader()
            new_c.writerows(rows)

    def classes_subset(self, classes, new_folder, root_dir):
        """ Given a set of classes to keep and the locations of CSV's, builds a new directory where only the specified
        classes are kept

        Parameters
        ----------
        classes : iterable of str
            Set of class ids to be kept e.g. /m/02wbm (corresponding to food), class ids can be found in the class names
            metadata file which can be downloaded from the Open Images website
        new_folder : str
            New folder to place new CSV's
        root_dir : str
            Root directory containing csv files and new folder
        """
        os.mkdir(os.path.join(root_dir, new_folder))
        # BUG FIX: the original tested isinstance(classes, type), which is never
        # true for an iterable of class-id strings. The intent is to normalise
        # any iterable into a set once, for O(1) membership tests downstream.
        if not isinstance(classes, set):
            classes = set(classes)
        for subset in ["train", "validation", "test"]:
            print("Building new CSVs for {}".format(subset))
            image_labels_file = self.common.get_image_labels_file(subset)
            image_ids_file = self.common.get_image_ids_file(subset)
            boxes_file = None
            if not self.image_level:
                boxes_file = self.common.get_boxes_file(subset)
            self.build_images_csv(image_labels_file, image_ids_file, new_folder, root_dir, classes,
                                  boxes_file=boxes_file)

    def random_classes_subset(self, new_folder, root_dir, n, seed=None):
        """ Samples n random classes and builds a new dataset in new_folder where only the specified classes are present

        Parameters
        ----------
        new_folder : str
            New folder to place new CSV's
        root_dir : str
            Root directory containing csv files and new folder
        n : int
            Number of classes to select
        seed : int
            Seed for random number generator
        """
        classes_file = 'classes-trainable.txt'
        classes_path = os.path.join(root_dir, classes_file)
        print("Selecting random sample of {} classes".format(n))
        classes = Select.select_random_classes(classes_path, n, seed=seed)
        self.classes_subset(classes, new_folder, root_dir)
        # Record which classes were sampled alongside the new CSVs.
        new_classes_path = os.path.join(root_dir, new_folder, classes_file)
        Common.new_text_file(new_classes_path, classes)

    def images_sample(self, new_folder, root_dir, ns, n_jobs=None, fix_rotation=None, resize=None, required_columns=None,
                      seed=None, attempts=None, timeout=None, wait=None, common_download_errors=None):
        """ Samples n random images and builds a new dataset in new_folder where only the specified classes are present

        Parameters
        ----------
        new_folder : str
            New folder to place new CSV's
        root_dir : str
            Root directory containing csv files and new folder
        ns : tuple of int
            Number of images to select for (training, validation, test) respectively, if any are None all are sampled
        n_jobs : int
            Number of images to download in parallel at once. Default of 9, as there are around 9 farms, so this means
            on average we'll only be making 1 request to a farm at a time
        fix_rotation : bool
            Whether to fix the rotation of the image, by default true, see here for more information
            https://storage.googleapis.com/openimages/web/2018-05-17-rotation-information.html
        resize : bool
            Whether to resize images as described in the Faster RCNN paper, and discussed here
            https://github.com/tensorflow/models/issues/1794#issuecomment-311569473 . Benefit is reduces storage space
            without effecting training if using the FasterRCNN Inception ResNet V2 architecture. Default is False.
        required_columns : list of str
            Set of columns required to not be the empty string for the row to be included in the sample
        seed : int
            Seed for random number generator
        attempts : int
            Maximum number of attempts to try downloading an image
        timeout : float
            Timeout in seconds for a request
        wait : float
            Time to wait after a failed download attempt
        common_download_errors : bool
            Whether to show common expected download error (HTTP 404 and 410) messages, default False
        """
        seed = seed or 0
        n_jobs = n_jobs or 9
        fix_rotation = fix_rotation if fix_rotation is not None else True
        new_root = os.path.join(root_dir, new_folder)
        images_folder = os.path.join(new_root, "images")
        if not os.path.isdir(new_root):
            os.mkdir(new_root)
        if not os.path.isdir(images_folder):
            os.mkdir(images_folder)
        subsets = ["train", "validation", "test"]
        for subset, n in zip(subsets, ns):
            print("Building subset for {}".format(subset))
            image_ids_file = self.common.get_image_ids_file(subset)
            image_ids_path = os.path.join(root_dir, image_ids_file)
            print("Loading images rows")
            rows = Select.get_rows(image_ids_path, required_columns=required_columns)
            selected_image_ids = set()
            subset_path = os.path.join(images_folder, subset)
            if not os.path.isdir(subset_path):
                os.mkdir(subset_path)
            if n is None:
                # None means "take everything"; no shuffle needed.
                n = len(rows)
            else:
                print("Selecting {} rows".format(n))
                random.seed(seed)
                random.shuffle(rows)
            pos = 0
            # BUG FIX: the pool was previously never closed/joined, leaking
            # worker processes each subset; the context manager terminates it.
            with multiprocessing.Pool(n_jobs) as pool:
                downloader = partial(Common.pass_args_to_f,
                                     partial(Download.download_image, images_folder, resize=resize,
                                             download_folder=subset,
                                             attempts=attempts, timeout=timeout, wait=wait,
                                             common_download_errors=common_download_errors))
                req = n
                print("Downloading images")
                # Keep requesting further rows until we have n images or the
                # candidate rows are exhausted.
                while req > 0:
                    # BUG FIX: "is not ''" compared identity, not equality
                    # (SyntaxWarning on CPython >= 3.8); use != for value
                    # comparison of the Rotation field.
                    args = [[row["ImageID"], row["OriginalMD5"], row["OriginalURL"],
                             int(float(row["Rotation"])) if fix_rotation and row["Rotation"] != '' else None]
                            for row in rows[pos: pos + req]]
                    for image_id in tqdm(pool.imap(downloader, args), total=len(args)):
                        # download_image returns None on failure.
                        if image_id is not None:
                            selected_image_ids.add(image_id)
                    pos += req
                    available = len(rows) - pos
                    failed = n - len(selected_image_ids)
                    req = min(failed, available)
                    if req > 0:
                        print("Failed to download {} images, trying to download next {} instead".format(failed, req))
            image_labels_file = self.common.get_image_labels_file(subset)
            to_process = [image_ids_file, image_labels_file]
            if not self.image_level:
                to_process.append(self.common.get_boxes_file(subset))
            print("Creating new CSVs for subset")
            for csv_file in to_process:
                Common.copy_rows_on_image_id(root_dir, new_folder, csv_file, selected_image_ids)