def run(self):
    self.logger = log.get_logger('Load')
    self.process_options()
    self.validate_conditions()
    self.logger.debug("Produced spec:\n%s", json.pformat(self.job.spec))
    self.queue_job()

def run(self):
    self.logger = log.get_logger('CancelJob')
    self.tasks = Tasks()

    rows_affected = 0
    if self.options.multiple:
        rows_affected = self.tasks.bulk_finish(extra_predicate=("job_id LIKE :job_id", {
            'job_id': self.options.job_id + '%%'
        }))
    else:
        loader_storage = LoaderStorage()
        with loader_storage.transaction() as cursor:
            jobs = apsw_helpers.query(cursor, '''
                SELECT id FROM jobs WHERE id LIKE :job_id
            ''', job_id=self.options.job_id + '%')
        if len(jobs) > 1:
            print len(jobs), 'jobs match this job ID:'
            print '\n'.join([row.id for row in jobs])
            print 'Please use a more specific prefix or specify the `--multiple` flag if you'
            print 'would like to cancel more than one job.'
            sys.exit(1)
        elif len(jobs) == 0:
            print '0 jobs match this job ID.'
            sys.exit(1)
        else:
            rows_affected = self.tasks.bulk_finish(extra_predicate=("job_id = :job_id", {
                'job_id': jobs[0].id
            }))

    job_suffix = '(s)' if self.options.multiple else ''
    task_suffix = 's' if not rows_affected == 1 else ''
    print CANCEL_JOB_MESSAGE % (job_suffix, self.options.job_id, rows_affected, task_suffix)

def run(self):
    signal.signal(signal.SIGINT, self.stop)
    signal.signal(signal.SIGQUIT, self.stop)
    signal.signal(signal.SIGTERM, self.stop)

    self.exiting = False
    self.logger = log.get_logger('Server')

    # switch over to the correct user as soon as possible
    if self.options.set_user is not None:
        if not setuser(self.options.set_user):
            self.logger.error('failed to switch to user %s' % self.options.set_user)
            sys.exit(1)

    if self.options.daemonize:
        # ensure connection pool forks from daemon
        pool.close_connections()
        with storage.LoaderStorage.fork_wrapper():
            daemonize(self.options.log_path)
        pool.recreate_pool()

    # record the fact that we've started successfully
    self.servers = Servers()
    self.servers.ping()

    if self.options.num_workers > WORKER_WARN_THRESHOLD and not self.options.force_workers:
        if not cli_utils.confirm(
                'Are you sure you want to start %d workers? This is potentially dangerous.'
                % self.options.num_workers, default=False):
            print 'Exiting.'
            sys.exit(1)

    self.logger.debug('Starting worker pool')
    self.pool = WorkerPool(num_workers=self.options.num_workers)

    print 'MemSQL Loader Server running'

    loader_db_name = storage.MEMSQL_LOADER_DB
    has_valid_loader_db_conn = False
    while not self.exiting:
        try:
            if bootstrap.check_bootstrapped():
                has_valid_loader_db_conn = True
                self.pool.poll()
                self.servers.ping()
                time.sleep(1)
            else:
                if has_valid_loader_db_conn:
                    self.logger.warn(
                        'The %s database is unreachable or not ready; stopping worker pool',
                        loader_db_name)
                    self.pool.stop()
                has_valid_loader_db_conn = False
                time.sleep(5)
        except KeyboardInterrupt:
            break

    self.stop()

def run(self):
    self.logger = log.get_logger('Jobs')
    self.jobs_api = JobsApi()

    try:
        result = self.jobs_api.query({
            k: v for k, v in {
                'state': self.options.state,
                'order': self.options.order,
                'order_by': self.options.order_by,
                'page': self.options.page,
                'page_size': self.options.page_size
            }.iteritems() if v
        })
    except exceptions.ApiException as e:
        print e.message
        sys.exit(1)

    if result:
        tablefmt = TableFormat.JSON if self.options.json else TableFormat.TABLE
        columns = JobsApi.SORTABLE_COLUMNS + ["database", "table"]
        for job in result:
            job["database"] = job.spec["target"]["database"]
            job["table"] = job.spec["target"]["table"]
        print PrettyPrinter(result, columns=columns, format=tablefmt, align={
            "database": "l",
            "table": "l",
        }).format()
    else:
        print 'No jobs found that match this query'
        sys.exit(1)

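# Standalone sketch (not part of the loader): the dict comprehension in the
# query call above drops any option the user did not supply, so only explicit
# filters reach JobsApi.query(). The option values below are hypothetical.
def _demo_filter_options():
    opts = {'state': 'running', 'order': None, 'order_by': '', 'page': 2}
    return {k: v for k, v in opts.iteritems() if v}

assert _demo_filter_options() == {'state': 'running', 'page': 2}
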
def build_spec(base_spec, options):
    # for each part in the base_spec, we expect one of two
    # things to be exposed in the options -> either the key name
    # itself or full-schema-path-to-keyname.
    logger = log.get_logger('Schema')
    return build_spec_recursive(logger, options, base_spec, get_spec_validator(), [])

def run(self):
    self.logger = log.get_logger('Tasks')
    self.tasks_api = TasksApi()

    if not self.options.job_id and not self.options.last_job:
        print 'You must specify a job ID or use the --last-job option.'
        sys.exit(1)

    if self.options.last_job:
        jobs = Jobs()
        job_list = jobs.all()
        if not job_list:
            print 'No jobs found.'
            sys.exit(1)
        self.options.job_id = job_list[-1].id

    try:
        result = self.tasks_api.query({
            k: v for k, v in {
                'job_id': self.options.job_id,
                'state': self.options.state,
                'order': self.options.order,
                'order_by': self.options.order_by,
                'page_size': self.options.page_size,
                'page': self.options.page,
            }.iteritems() if v
        })
    except exceptions.ApiException as e:
        print e.message
        sys.exit(1)

    if result:
        tablefmt = TableFormat.JSON if self.options.json else TableFormat.TABLE
        print PrettyPrinter(result, columns=TasksApi.SORTABLE_COLUMNS, format=tablefmt).format()
    else:
        print 'No tasks found that match this query'
        sys.exit(1)

def __init__(self, num_workers=None):
    self.logger = log.get_logger('WorkerPool')
    self.num_workers = num_workers or max(1, int(multiprocessing.cpu_count() * 0.8))
    self._workers = []
    self.pid = os.getpid()
    self._worker_lock = multiprocessing.Lock()

def __init__(self, worker_sleep, parent_pid, worker_lock):
    self.worker_id = uuid.uuid1().hex[:8]
    self.worker_sleep = worker_sleep
    self.worker_lock = worker_lock
    self.parent_pid = parent_pid
    self._exit_evt = multiprocessing.Event()
    self.logger = log.get_logger('worker[%s]' % self.worker_id)
    super(Worker, self).__init__(name=('worker-%s' % self.worker_id))

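# Standalone sketch: worker IDs above are the first 8 hex digits of a UUID1,
# which is what shows up in the 'worker[...]' logger name and the process name.
import uuid

worker_id = uuid.uuid1().hex[:8]
print 'worker[%s]' % worker_id   # e.g. worker[1a2b3c4d]
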
def run(self):
    self.logger = log.get_logger('CancelTask')
    self.tasks = Tasks()

    rows_affected = self.tasks.bulk_finish(extra_predicate=('id = :task_id', {
        'task_id': self.options.task_id
    }))

    plural = not rows_affected == 1
    print 'Cancelled', rows_affected, 'task%s.' % ('s' if plural else '')

def __init__(self, num_workers=None, idle_timeout=None):
    self.logger = log.get_logger('WorkerPool')
    self.num_workers = num_workers or max(1, int(multiprocessing.cpu_count() * 0.8))
    self.idle_timeout = idle_timeout
    self._workers = []
    self.pid = os.getpid()
    self._worker_lock = multiprocessing.Lock()
    self._last_work_time = time.time()

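# Standalone sketch of the default worker-count formula used by WorkerPool
# above: 80% of the machine's CPUs, truncated, but never fewer than one worker.
import multiprocessing

def _default_num_workers(cpu_count=None):
    cpus = cpu_count if cpu_count is not None else multiprocessing.cpu_count()
    return max(1, int(cpus * 0.8))

assert _default_num_workers(8) == 6
assert _default_num_workers(1) == 1
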
def __init__(self):
    super(Downloader, self).__init__()
    self.logger = log.get_logger('downloader')
    self._error = None
    self._tb = None
    self._should_exit = False
    self._last_size = -1
    self._last_download_time = 0

def run(self):
    self.logger = log.get_logger('Task')
    self.task_api = TaskApi()

    try:
        result = self.task_api.query({'task_id': self.options.task_id})
    except exceptions.ApiException as e:
        print e.message
        sys.exit(1)

    result = {
        k: str(v) if isinstance(v, SuperEnum.Element) else v
        for k, v in result.items()
    }
    print json.dumps(result, sort_keys=True, indent=4 * ' ')

def run(self):
    self.logger = log.get_logger('Processes')
    self.error = False
    self.KEY_FN = self.JOBS_KEY_FN if self.options.jobs else self.TASKS_KEY_FN

    if self.options.watch:
        # Takes care of setup and tear-down
        curses.wrapper(self.main_cli)
    else:
        print self.get_ps_str()
    if self.error:
        sys.exit(1)

def run(self):
    self.logger = log.get_logger('loader')

    # Because self._conn was passed in from the worker thread, we need
    # to call the mysql_thread_init() C function to make sure that
    # everything is initialized properly. However, _mysql doesn't expose
    # that function, so we call it implicitly by creating a MySQL
    # connection with a socket that's guaranteed to be invalid.
    try:
        _mysql.connect(unix_socket='.')
    except _mysql.MySQLError:
        pass

    try:
        self.logger.info('Starting loader')
        try:
            with self._conn_lock:
                self._active_conn_id = self._conn.thread_id()
            with self._task.protect():
                self._task.data['conn_id'] = self._active_conn_id
                self._task.save()
            row_count = self._conn.query(self._sql, *self._params)
        finally:
            with self._conn_lock:
                self._active_conn_id = None
        with self._task.protect():
            self._task.data['row_count'] = row_count
            self._task.save()
    except connection_wrapper.ConnectionWrapperException as e:
        self.logger.error('LOAD DATA connection error: %s', str(e))
        self._set_error(ConnectionException(str(e)))
    except pool.MySQLError as e:
        errno, msg = e.args
        msg = "LOAD DATA error (%d): %s" % (errno, msg)
        self.logger.error(msg)
        self._set_error(WorkerException(msg))
    except Exception as e:
        self._set_error(e)
    except KeyboardInterrupt:
        self.logger.info('Received KeyboardInterrupt, exiting...')
    finally:
        self._fifo.detach_reader()
        self.logger.info('Finished LOAD_DATA')

def bootstrap(force=False):
    logger = log.get_logger('Bootstrap')  # noqa

    def write_log(title, name, msg):
        log_title_width = 28
        title = ("%s [%s]: " % (title, name)).rjust(log_title_width, ' ')
        logger.info(title + msg)

    write_log('Database', storage.MEMSQL_LOADER_DB, 'Checking...')
    if force:
        write_log('Database', storage.MEMSQL_LOADER_DB, 'Dropping...')
        storage.LoaderStorage.drop_database()
    write_log('Database', storage.MEMSQL_LOADER_DB, 'Ready.')

    for Model in MODELS.values():
        instance = Model()
        if not instance.ready():
            write_log('Table', Model.__name__, 'Bootstrapping...')
            instance.setup()
        write_log('Table', Model.__name__, 'Ready.')

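# Standalone sketch of write_log() above: the "Title [name]: " prefix is
# right-justified to a fixed width of 28 characters so the bootstrap messages
# line up in the log. The database name used here is hypothetical.
prefix = ('%s [%s]: ' % ('Database', 'loader_db')).rjust(28, ' ')
print prefix + 'Checking...'   # prints 'Database [loader_db]: Checking...' padded to 28 chars
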
def get_files(self, s3_conn=None):
    # We are standardizing on UNIX semantics for file matching (vs. S3 prefix semantics). This means
    # we expect that on both S3 and UNIX:
    # bucket/1
    # bucket/2
    # bucket/a/1
    # bucket/a/2
    #
    # bucket/* matches just 1,2 and bucket/** matches all 4 files
    logger = log.get_logger('Jobs')
    for load_path in self.paths:
        if load_path.scheme == 's3':
            bucket = s3_conn.get_bucket(load_path.bucket)
            s3_globber = glob2.S3Globber(bucket)
            for keyname in s3_globber.glob(load_path.pattern):
                if not s3_globber.isdir(keyname):
                    try:
                        key = s3_globber.get_key(keyname)
                        if key is not None:
                            yield AttrDict({
                                'scheme': 's3',
                                'name': key.name,
                                'etag': key.etag,
                                'size': key.size,
                                'bucket': bucket
                            })
                        else:
                            logger.warning("Key `%s` not found, skipping", keyname)
                    except S3ResponseError as e:
                        logger.warning("Received %s %s accessing `%s`, skipping", e.status, e.reason, keyname)
        elif load_path.scheme == 'file':
            fs_globber = glob2.Globber()
            for fname in fs_globber.glob(load_path.pattern):
                if not fs_globber.isdir(fname):
                    yield AttrDict({
                        'scheme': 'file',
                        'name': fname,
                        'etag': None,
                        'size': os.path.getsize(fs_globber._normalize_string(fname)),
                        'bucket': None
                    })
        elif load_path.scheme == 'hdfs':
            hdfs_host = self.spec.source.hdfs_host
            webhdfs_port = self.spec.source.webhdfs_port
            hdfs_user = self.spec.source.hdfs_user
            client = PyWebHdfsClient(hdfs_host, webhdfs_port, user_name=hdfs_user)
            hdfs_globber = glob2.HDFSGlobber(client)
            for fname in hdfs_globber.glob(load_path.pattern):
                if not hdfs_globber.isdir(fname):
                    fileinfo = hdfs_globber.get_fileinfo(fname)
                    yield AttrDict({
                        'scheme': 'hdfs',
                        'name': fileinfo['path'],
                        'etag': fileinfo['etag'],
                        'size': fileinfo['length'],
                        'bucket': None
                    })
        else:
            assert False, "Unknown scheme %s" % load_path.scheme

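# Hypothetical usage sketch of the local-filesystem branch above, assuming the
# loader's glob2 module is importable on its own and that a '/data' directory
# exists. Per the comment in get_files(): '/data/*' would match only files
# directly under /data, while '/data/**' also matches files in subdirectories.
import glob2

fs_globber = glob2.Globber()
for fname in fs_globber.glob('/data/**'):
    if not fs_globber.isdir(fname):
        print fname
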
def run(self):
    self.logger = log.get_logger('Load')
    self.process_options()
    self.validate_conditions()
    self.queue_job()

def __init__(self):
    self.logger = log.get_logger(self.name or 'api')
    self.storage = LoaderStorage()

def run(self):
    self.logger = log.get_logger('Job')
    self.job_api = JobApi()
    self.tasks_api = TasksApi()

    try:
        result = self.job_api.query({'job_id': self.options.job_id})
    except exceptions.ApiException as e:
        print e.message
        sys.exit(1)

    if self.options.spec:
        print json.dumps(result.spec, sort_keys=True, indent=4 * ' ')
    else:
        try:
            finished_tasks = self.tasks_api.query({
                'job_id': self.options.job_id,
                'state': 'SUCCESS'
            })
        except exceptions.ApiException as e:
            print e.message
            sys.exit(1)

        files_loaded = len(finished_tasks)
        rows_loaded = reduce(lambda x, y: x + y.get('data', {}).get('row_count', 0), finished_tasks, 0)
        avg_rows_per_file = None
        avg_rows_per_second = None
        if files_loaded > 0:
            avg_rows_per_file = rows_loaded / files_loaded

            min_start_time = datetime.datetime.max
            max_stop_time = datetime.datetime.min
            for row in finished_tasks:
                for step in row.steps:
                    if step['name'] == 'download':
                        min_start_time = min(min_start_time, step['start'])
                        max_stop_time = max(max_stop_time, step['stop'])
                        break
                else:
                    continue
            avg_rows_per_second = rows_loaded / (max_stop_time - min_start_time).total_seconds()

        result['stats'] = {
            k: v for k, v in {
                'files_loaded': files_loaded,
                'rows_loaded': rows_loaded,
                'avg_rows_per_file': avg_rows_per_file,
                'avg_rows_per_second': avg_rows_per_second
            }.iteritems() if v is not None
        }

        if result.tasks_total > 0:
            result['stats'].update({
                'success_rate': result.tasks_succeeded * 1.0 / result.tasks_total,
                'error_rate': result.tasks_errored * 1.0 / result.tasks_total
            })

        result["database"] = result.spec["target"]["database"]
        result["table"] = result.spec["target"]["table"]
        result = dict(result)
        del result['spec']
        result = {
            k: str(v) if isinstance(v, SuperEnum.Element) else v
            for k, v in result.iteritems()
        }
        print json.dumps(result, sort_keys=True, indent=4 * ' ')

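# Standalone sketch of the throughput figure computed above: rows loaded
# divided by the wall-clock span from the earliest 'download' step start to the
# latest 'download' step stop across all successful tasks. Values are made up.
import datetime

rows_loaded = 1200000
min_start_time = datetime.datetime(2015, 1, 1, 0, 0, 0)
max_stop_time = datetime.datetime(2015, 1, 1, 0, 10, 0)
print rows_loaded / (max_stop_time - min_start_time).total_seconds()   # 2000.0 rows/sec
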