def validate_conditions(self):
    """Check the "external" viability of the job after schema validation.

    Verifies conditions outside the spec format itself: that every input
    path passes its per-path checks, that the target database and table
    exist, and that the configured ``file_id_column`` is present with the
    right type.  This runs after print_spec because these checks don't
    have to do with the validity of the schema format.

    Exits the process (status 1) on the first failed check.
    """
    self.s3_conn = None

    # Per-path validation (existence/accessibility checks live in the helper).
    for path in self.job.paths:
        self.validate_path_conditions(path)

    target = self.job.spec.target
    file_id_col = self.job.spec.options.file_id_column

    # One INFORMATION_SCHEMA connection suffices for every metadata check;
    # the previous version opened a second, redundant connection just for
    # the file_id_column validation.
    with pool.get_connection(database='INFORMATION_SCHEMA',
                             **self.job.spec.connection) as conn:
        has_database, has_table = db_utils.validate_database_table(
            conn, target.database, target.table)

        if not has_database:
            self.logger.error("The specified database `%s` does not exist.",
                              target.database)
            sys.exit(1)

        if not has_table:
            self.logger.error("The specified table `%s` does not exist.",
                              target.table)
            sys.exit(1)

        if not db_utils.validate_file_id_column(
                conn, target.database, target.table, file_id_col):
            self.logger.error(
                "The `file_id_column` specified (%s) must exist in the table and be of type BIGINT UNSIGNED",
                file_id_col)
            sys.exit(1)
def validate_conditions(self):
    """Check the "external" viability of the job after schema validation.

    Verifies conditions outside the spec format itself: per-path checks,
    existence of the target database/table, validity of the configured
    ``file_id_column``, and (when dynamic columns are enabled) that the
    target table's column list can be read.  This runs after print_spec
    because these checks don't have to do with the validity of the schema
    format.

    Exits the process (status 1) on the first failed check.
    """
    self.s3_conn = None

    # Per-path validation (existence/accessibility checks live in the helper).
    for path in self.job.paths:
        self.validate_path_conditions(path)

    target = self.job.spec.target
    file_id_col = self.job.spec.options.file_id_column

    # One INFORMATION_SCHEMA connection suffices for every metadata check;
    # the previous version opened a second, redundant connection just for
    # the file_id_column validation.
    with pool.get_connection(database='INFORMATION_SCHEMA',
                             **self.job.spec.connection) as conn:
        has_database, has_table = db_utils.validate_database_table(
            conn, target.database, target.table)

        if not has_database:
            self.logger.error("The specified database `%s` does not exist.",
                              target.database)
            sys.exit(1)

        if not has_table:
            self.logger.error("The specified table `%s` does not exist.",
                              target.table)
            sys.exit(1)

        if not db_utils.validate_file_id_column(
                conn, target.database, target.table, file_id_col):
            self.logger.error(
                "The `file_id_column` specified (%s) must exist in the table and be of type BIGINT UNSIGNED",
                file_id_col)
            sys.exit(1)

    if self.options.dynamic_columns:
        # TODO: read the first header line of the input instead of
        # deriving the column list from the live table.
        with pool.get_connection(target.database,
                                 **self.job.spec.connection) as conn:
            self.job.spec.options.columns = db_utils.get_column_names(
                conn, target.database, target.table)
            if not self.job.spec.options.columns:
                self.logger.error("The table specified (%s) must exist",
                                  target.table)
                sys.exit(1)
def kill_delete_query_if_exists(self, conn_args, conn_id):
    """Kill the DELETE query running on connection ``conn_id``, if any.

    Looks the connection up in the processlist first so we only issue a
    KILL QUERY when a DELETE is actually in flight.
    """
    sql = ("SELECT id FROM processlist "
           "WHERE info LIKE '%%DELETE%%' AND id=%s")
    with pool.get_connection(database='information_schema', **conn_args) as conn:
        matches = conn.query(sql, conn_id)
        if matches:
            db_utils.try_kill_query(conn, conn_id)
def run(self):
    """Main worker loop: pull tasks off the queue and process them.

    Loops until ``exiting()`` becomes true, then unwinds via
    ExitingException.  SIGINT/SIGQUIT are ignored so shutdown is
    coordinated by the parent process rather than by signals delivered
    directly to this worker.  On exit, an unfinished task is requeued so
    another worker can pick it up.
    """
    self.jobs = Jobs()
    self.tasks = Tasks()
    task = None

    # E731: use a def, not a named lambda, for the no-op signal handler.
    def ignore(*args, **kwargs):
        pass

    signal.signal(signal.SIGINT, ignore)
    signal.signal(signal.SIGQUIT, ignore)

    try:
        while not self.exiting():
            # Small random sleep spreads queue polling across workers.
            time.sleep(random.random() * 0.5)
            task = self.tasks.start()
            if task is None:
                self.worker_working.value = 0
            else:
                self.worker_working.value = 1
                job = self.jobs.get(task.job_id)

                # A previous attempt may have left a query running under
                # this connection id; kill it before retrying.
                old_conn_id = task.data.get('conn_id', None)
                if old_conn_id is not None:
                    self.kill_query_if_exists(job.spec.connection, old_conn_id)

                # Lazy %-args, consistent with the other logging calls.
                self.logger.info('Task %d: starting', task.task_id)
                try:
                    # can't use a pooled connection due to transactions staying
                    # open in the pool on failure
                    with pool.get_connection(
                            database=job.spec.target.database,
                            pooled=False,
                            **job.spec.connection) as db_connection:
                        db_connection.execute("BEGIN")
                        self._process_task(task, db_connection)
                    self.logger.info('Task %d: finished with success',
                                     task.task_id)
                except (RequeueTask, ConnectionException):
                    self.logger.info('Task %d: download failed, requeueing',
                                     task.task_id)
                    self.logger.debug("Traceback: %s", traceback.format_exc())
                    task.requeue()
                except TaskDoesNotExist:
                    self.logger.info(
                        'Task %d: finished with error, the task was either cancelled or deleted',
                        task.task_id)
                    self.logger.debug("Traceback: %s", traceback.format_exc())
                except WorkerException as e:
                    task.error(str(e))
                    self.logger.info('Task %d: finished with error',
                                     task.task_id)
                except Exception:
                    self.logger.debug("Traceback: %s", traceback.format_exc())
                    raise
        raise ExitingException()
    except ExitingException:
        self.logger.debug('Worker exiting')
        # Return an unfinished task to the queue so another worker can run it.
        if task is not None and not task.valid():
            try:
                task.requeue()
            except APSWSQLStepQueueException:
                pass
def kill_query_if_exists(self, conn_args, conn_id):
    """Kill the connection ``conn_id`` if it is still running a LOAD DATA query."""
    with pool.get_connection(database='information_schema', **conn_args) as conn:
        rows = conn.query(
            "SELECT id FROM processlist WHERE info LIKE '%%LOAD DATA%%' AND id=%s",
            conn_id)
        # Since this is a LOAD DATA LOCAL query, we need to kill the
        # connection, not the query, since LOAD DATA LOCAL queries
        # don't end until the file is fully read, even if they're
        # killed.
        if rows:
            db_utils.try_kill_connection(conn, conn_id)
def kill_query_if_exists(self, conn_args, conn_id):
    """If connection ``conn_id`` is still running a LOAD DATA query, kill it."""
    sql = ("SELECT id FROM processlist "
           "WHERE info LIKE '%%LOAD DATA%%' AND id=%s")
    with pool.get_connection(database='information_schema', **conn_args) as conn:
        if not conn.query(sql, conn_id):
            return
        # Kill the connection rather than the query: a LOAD DATA LOCAL
        # query keeps reading the file even after the query itself is
        # killed, so only dropping the connection actually stops it.
        db_utils.try_kill_connection(conn, conn_id)
def abort(self):
    """Best-effort kill of the connection running the active query.

    Returns True when there was an active connection id to abort (whether
    or not the kill itself succeeded), False when there was nothing to do.
    """
    with self._conn_lock:
        if self._active_conn_id is None:
            return False
        try:
            with pool.get_connection(database='',
                                     **self._job.spec.connection) as conn:
                db_utils.try_kill_connection(conn, self._active_conn_id)
        except pool.PoolConnectionException:
            # If we couldn't connect, then its likely that we lost
            # connection to the database and that the query is dead
            # because of that anyways.
            pass
        return True
def abort(self):
    """Try to kill the connection that is running the active query.

    Returns:
        bool: True when an active connection id existed (a kill was
        attempted), False when there was nothing to abort.
    """
    with self._conn_lock:
        conn_id = self._active_conn_id
        if conn_id is not None:
            try:
                with pool.get_connection(
                        database='', **self._job.spec.connection) as conn:
                    db_utils.try_kill_connection(conn, conn_id)
            except pool.PoolConnectionException:
                # Losing the connection to the database most likely means
                # the query died along with it, so ignore the failure.
                pass
            return True
        return False
def run(self):
    """Main worker loop: pull tasks off the queue and process them.

    Loops until ``exiting()`` becomes true, then unwinds via
    ExitingException.  SIGINT/SIGQUIT are ignored so shutdown is
    coordinated by the parent process rather than by signals delivered
    directly to this worker.  On exit, an unfinished task is requeued so
    another worker can pick it up.
    """
    self.jobs = Jobs()
    self.tasks = Tasks()
    task = None

    # E731: use a def, not a named lambda, for the no-op signal handler.
    def ignore(*args, **kwargs):
        pass

    signal.signal(signal.SIGINT, ignore)
    signal.signal(signal.SIGQUIT, ignore)

    try:
        while not self.exiting():
            # Small random sleep spreads queue polling across workers.
            time.sleep(random.random() * 0.5)
            task = self.tasks.start()
            if task is None:
                self.worker_working.value = 0
            else:
                self.worker_working.value = 1
                job = self.jobs.get(task.job_id)

                # A previous attempt may have left a query running under
                # this connection id; kill it before retrying.
                old_conn_id = task.data.get('conn_id', None)
                if old_conn_id is not None:
                    self.kill_query_if_exists(job.spec.connection, old_conn_id)

                # Lazy %-args, consistent with the other logging calls.
                self.logger.info('Task %d: starting', task.task_id)
                try:
                    # can't use a pooled connection due to transactions staying
                    # open in the pool on failure
                    with pool.get_connection(
                            database=job.spec.target.database,
                            pooled=False,
                            **job.spec.connection) as db_connection:
                        db_connection.execute("BEGIN")
                        self._process_task(task, db_connection)
                    self.logger.info('Task %d: finished with success',
                                     task.task_id)
                except (RequeueTask, ConnectionException):
                    self.logger.info('Task %d: download failed, requeueing',
                                     task.task_id)
                    self.logger.debug("Traceback: %s", traceback.format_exc())
                    task.requeue()
                except TaskDoesNotExist:
                    self.logger.info(
                        'Task %d: finished with error, the task was either cancelled or deleted',
                        task.task_id)
                    self.logger.debug("Traceback: %s", traceback.format_exc())
                except WorkerException as e:
                    task.error(str(e))
                    self.logger.info('Task %d: finished with error',
                                     task.task_id)
                except Exception:
                    self.logger.debug("Traceback: %s", traceback.format_exc())
                    raise
        raise ExitingException()
    except ExitingException:
        self.logger.debug('Worker exiting')
        # Return an unfinished task to the queue so another worker can run it.
        if task is not None and not task.valid():
            try:
                task.requeue()
            except APSWSQLStepQueueException:
                pass
def kill_delete_query_if_exists(self, conn_args, conn_id):
    """If a DELETE is running under connection ``conn_id``, kill that query."""
    with pool.get_connection(database='information_schema', **conn_args) as conn:
        found = conn.query(
            "SELECT id FROM processlist WHERE info LIKE '%%DELETE%%' AND id=%s",
            conn_id)
        if found:
            db_utils.try_kill_query(conn, conn_id)