"""Core module.
This module contains the Core class, which is the one responsible for
executing and orchestrating the main Core functionalities.
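
Example (a minimal usage sketch; the work directory shown is an assumption,
not a requirement of this module)::

    core = Core(work_dir='/tmp/core-cache')
    core.work(use_defaults=True, wait=False)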
"""
import logging
import math
import os
import random
import re
import shutil
import signal
import sys
import time
import uuid
from io import StringIO
from operator import attrgetter
from pathlib import Path
from typing import List, Optional

import pandas as pd
import psutil
from pympler import muppy, summary, refbrowser
from tqdm import tqdm

from constants import RunStatus
from data import store_data, upload_data, delete_data
from database import Database, Dataset
from metafeatures import MetaFeatures
from utilities import hash_file
from worker import AlgorithmError, Worker

LOGGER = logging.getLogger(__name__)


class Core:
    _LOOP_WAIT = 5

    def __init__(
        self,
        # SQL Conf
        dialect: str = 'sqlite',
        database: str = 'assets/ml-base.db',
        username: Optional[str] = None,
        password: Optional[str] = None,
        host: Optional[str] = None,
        port: Optional[int] = None,
        query=None,
        # Generic Conf
        work_dir: Optional[str] = None,
        timeout: Optional[int] = None,
        cache_percentage: float = 0.99,
        dataset_budget: Optional[int] = None,
        max_pipeline_depth: int = 5,
        # Worker Conf
        complete_pipelines: bool = False,
        complete_pipeline_samples: int = 20,
        affinity: bool = False,
        # S3 Conf
        service_account: Optional[str] = None,
        bucket: Optional[str] = None,
        # Log Conf
        verbose_metrics: bool = False,
    ):
self.metafeatures = MetaFeatures()
self.db = Database(dialect, database, username, password, host, port, query)
self.work_dir = work_dir
self.timeout = timeout
self.dataset_budget = dataset_budget
self.complete_pipelines = complete_pipelines
self.complete_pipeline_samples = complete_pipeline_samples
self.max_pipeline_depth = max_pipeline_depth
self.affinity = affinity
self.s3_config: str = service_account
self.s3_bucket: str = bucket
self.verbose_metrics: bool = verbose_metrics
self._abort = False
        LOGGER.info('Scanning cache dir. This may take a while...')
Path(self.work_dir).mkdir(parents=True, exist_ok=True)
self.cache_total, self.cache_used, free = shutil.disk_usage(self.work_dir)
self.cache_percentage = cache_percentage

    def add_dataset(self, df: pd.DataFrame, class_column: str, depth: int,
                    budget: Optional[int] = None, name: Optional[str] = None):
        """Add a new dataset to the Database.

        Args:
            df (DataFrame):
                The input dataset.
            class_column (str):
                The column of the input dataset which is to be predicted.
            depth (int):
                The max pipeline depth a dataset can reach.
            budget (int):
                Maximum number of evaluations for this dataset. Optional. If not
                given, the global ``dataset_budget`` is used.
            name (str):
                Name given to this dataset. Optional. If not given, a random uuid
                will be generated and used as the dataset name.
        """
"""Generate name using a random uuid, if input dataset has no name"""
if not name or name.strip() == '':
name = str(uuid.uuid4())
LOGGER.info('Creating dataset {}'.format(name))
"""Stores input dataset to local working directory"""
local_file = self._cache_locally(df, name)
"""Check if new dataset equals existing dataset. If False store transformed dataset to DB"""
hashcode = hash_file(local_file)
similar_datasets: List[Dataset] = self.db.get_datasets_by_hash(hashcode)
for ds in similar_datasets:
df_old = ds.load(self.s3_config, self.s3_bucket)
if df.equals(df_old):
LOGGER.info('New dataset equals dataset {} and is not stored in the DB.'.format(ds.id))
delete_data(local_file)
return ds
del df_old
"""Uploads input dataset to cloud"""
upload_data(local_file, self.s3_config, self.s3_bucket, name)
"""Calculates metafeatures for input dataset"""
try:
LOGGER.info('Extracting meta-features...')
mf, success = self.metafeatures.calculate(df=df, class_column=class_column)
for key, value in mf.items():
if math.isinf(value):
                    LOGGER.info(
                        'Value of meta-feature "{}" is infinite and is replaced by a constant value'.format(key))
if value > 0:
mf[key] = sys.maxsize
else:
mf[key] = -sys.maxsize
        except ValueError:
            LOGGER.exception('Failed to compute meta-features. Falling back to empty meta-features.')
            mf, success = {}, False
        if not success:
            LOGGER.info("Meta-feature extraction failed. Marking this dataset as 'skipped'")
            mf['status'] = RunStatus.SKIPPED
            delete_data(local_file)

        # Save the input dataset and the calculated meta-features to the DB
if budget is None:
budget = self.dataset_budget
return self.db.create_dataset(
train_path=local_file,
name=name,
class_column=class_column,
depth=depth,
budget=budget,
hashcode=hashcode,
**mf
)

    def _cache_locally(self, df: pd.DataFrame, name: str) -> str:
        def clean_cache():
            LOGGER.info('Cleaning cache. This may take a while...')
            # For a complete local execution the cache must not be cleaned, as it
            # holds the only copy of the data. Abort instead of cleaning; the
            # cleanup code below is intentionally unreachable.
            sys.exit(-1)
            shutil.rmtree(self.work_dir)
            Path(self.work_dir).mkdir(parents=True, exist_ok=True)
            self.cache_total, self.cache_used, free = shutil.disk_usage(self.work_dir)
            LOGGER.info('Deleted local cache. Using {:.1%} of cache'.format(self.cache_used / self.cache_total))

        try:
            local_file = store_data(df, self.work_dir, name)
        except IOError:
            clean_cache()
            local_file = store_data(df, self.work_dir, name)

        # Track cache usage and clean up once it exceeds the configured threshold
        self.cache_used += os.stat(local_file).st_size
        if self.cache_used / self.cache_total > self.cache_percentage:
            clean_cache()
        return local_file

    def _user_abort(self):
LOGGER.info('Received abort signal. Stopping processing after current evaluation...')
self._abort = True

    def work(self, use_defaults=True, choose_randomly=True, wait=True, verbose=False):
        """Get unfinished Datasets from the database and work on them.

        Args:
            use_defaults (bool):
                If ``True``, evaluate only the default configurations via
                ``Worker.run_default`` and mark each dataset as complete
                afterwards. Otherwise, search the pipeline space with
                ``Worker.run_algorithm``. Optional. Defaults to ``True``.
            choose_randomly (bool):
                If ``True``, work on all the highest-priority datasets in random order.
                Otherwise, work on them in sequential order (by ID).
                Optional. Defaults to ``True``.
            wait (bool):
                If ``True``, wait for more datasets to be inserted into the Database
                once all have been processed. Otherwise, exit the worker loop
                when they run out.
                Optional. Defaults to ``True``.
            verbose (bool):
                Whether to be verbose about the process. Optional. Defaults to ``False``.
        """
signal.signal(signal.SIGUSR1, lambda s, frame: self._user_abort())
# ##########################################################################
# # Main Loop ############################################################
# ##########################################################################
        failure_counter = 0

        # Count the number of running workers to pin this worker to its own CPU core
        pids = set()
        core = None
        if self.affinity:
            for p in psutil.process_iter():
                if re.match('.*python\\d?', p.name()) and 'worker' in p.cmdline() and \
                        len([arg for arg in p.cmdline() if arg.endswith('cli.py')]) > 0:
                    if p.parent() is None or p.parent().pid not in pids:
                        pids.add(p.pid)
            # Assign the next free core, wrapping around once all cores are taken
            core = {(len(pids) - 1) % os.cpu_count()}
            LOGGER.info('Setting affinity to {}'.format(core))
while True:
if self._abort:
LOGGER.info("Stopping processing due to user request")
break
ds = None
if use_defaults:
ds = self.db.select_dataset()
            else:
                # Get all pending and running datasets
                datasets = self.db.get_datasets()
                if len(datasets) > 0:
                    # Either choose a dataset randomly among the highest priority, or take the dataset with the lowest ID
                    if choose_randomly:
                        ds = random.choice(datasets)
                    else:
                        ds = sorted(datasets, key=attrgetter('id'))[0]
                    del datasets

                    try:
                        self.db.mark_dataset_running(ds.id)
                    except UserWarning:
                        LOGGER.warning('Skipping completed dataset: {}'.format(ds.id))
                        # This dataset was already completed by another worker; pick a new one
                        ds = None
if not ds:
if wait:
LOGGER.debug('No datasets found. Sleeping %d seconds and trying again.', self._LOOP_WAIT)
time.sleep(self._LOOP_WAIT)
continue
else:
LOGGER.info('No datasets found. Exiting.')
break
LOGGER.info('Computing on dataset {}'.format(ds.id))
            worker = None
            pbar = None
            try:
                # Progress bar
                pbar = tqdm(total=ds.budget, ascii=True, initial=ds.processed, disable=not verbose)

                # Create the worker
worker = Worker(self.db, ds, self, timeout=self.timeout,
s3_config=self.s3_config, s3_bucket=self.s3_bucket,
complete_pipelines=self.complete_pipelines,
complete_pipeline_samples=self.complete_pipeline_samples,
max_pipeline_depth=self.max_pipeline_depth,
affinity=core,
verbose_metrics=self.verbose_metrics)
"""Call run_algorithm as long as the chosen dataset is marked as RUNNING"""
while ds.status == RunStatus.RUNNING:
if use_defaults:
worker.run_default()
self.db.mark_dataset_complete(ds.id)
delete_data(ds.train_path)
break
success = worker.run_algorithm()
ds = self.db.get_dataset(ds.id)
if verbose and ds.processed > pbar.last_print_n:
pbar.update(ds.processed - pbar.last_print_n)
# Safety valve to abort execution if something is broken
if success is False:
LOGGER.error('Something went wrong. Sleeping {} seconds.'.format(self._LOOP_WAIT))
time.sleep(self._LOOP_WAIT)
failure_counter += 1
if failure_counter > 10:
LOGGER.fatal('Received 10 consecutive unexpected exceptions. Aborting evaluation.')
# We occasionally encounter OSError: [Errno 12] Cannot allocate memory. To debug the memory
# leak the current heap allocation is logged
all_objects = muppy.get_objects()
LOGGER.fatal('Heap Dump:\n' + '\n'.join(summary.format_(summary.summarize(all_objects))))
buffer = StringIO()
cb = refbrowser.StreamBrowser(self, maxdepth=4, str_func=lambda o: str(type(o)),
stream=buffer)
cb.print_tree()
LOGGER.fatal('References:\n' + buffer.getvalue())
sys.exit(1)
else:
failure_counter = 0
pbar.close()
            except AlgorithmError:
                # The exception has already been handled; just wait a moment so we
                # do not go out of control reporting errors
                LOGGER.error('Something went wrong. Sleeping {} seconds.'.format(self._LOOP_WAIT))
                time.sleep(self._LOOP_WAIT)
            finally:
                if pbar is not None:
                    pbar.close()
                del worker

    def export_pipelines(self):
        return self.db.export_pipelines()

    def export_datasets(self):
        return self.db.export_datasets()
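

if __name__ == '__main__':
    # Minimal usage sketch (an illustration, not part of the original module).
    # Assumptions: the default sqlite database location ('assets/ml-base.db') is
    # writable, and the data layer tolerates a missing S3 configuration. The
    # tiny DataFrame below is purely hypothetical example data.
    logging.basicConfig(level=logging.INFO)

    core = Core(work_dir='/tmp/core-cache', timeout=60)

    toy_df = pd.DataFrame({
        'feature_a': [0.1, 0.4, 0.35, 0.8],
        'feature_b': [1.0, 0.0, 1.0, 0.0],
        'label': [0, 1, 0, 1],
    })
    dataset = core.add_dataset(toy_df, class_column='label', depth=1, budget=10)
    LOGGER.info('Created dataset %s', dataset.id)

    # Process pending datasets once with default pipelines, then exit instead
    # of polling for new work.
    core.work(use_defaults=True, wait=False, verbose=True)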