Exemplo n.º 1
0
    def run(self):
        """Query the tasks of one job and print them.

        Requires either ``options.job_id`` or ``options.last_job``. With
        ``last_job`` set, the id of the final entry returned by
        ``Jobs().all()`` is stored in ``options.job_id`` first. Only option
        values that are truthy are sent to ``TasksApi.query``. The result is
        printed through ``PrettyPrinter`` as JSON when ``options.json`` is
        set and as a table otherwise.

        Exits the process with status 1 when no job is specified, when
        ``last_job`` is set but no jobs exist, when the API raises
        ``ApiException`` or when the query returns nothing.
        """
        self.logger = log.get_logger('Tasks')
        self.tasks_api = TasksApi()
        opts = self.options

        if not (opts.job_id or opts.last_job):
            print('You must specify a job ID or use the --last-job option.')
            sys.exit(1)

        if opts.last_job:
            known_jobs = Jobs().all()
            if not known_jobs:
                print('No jobs found.')
                sys.exit(1)
            opts.job_id = known_jobs[-1].id

        # Candidate query filters; falsy values are left out of the request.
        candidates = (
            ('job_id', opts.job_id),
            ('state', opts.state),
            ('order', opts.order),
            ('order_by', opts.order_by),
            ('page_size', opts.page_size),
            ('page', opts.page),
        )
        query_params = dict(
            (name, value) for name, value in candidates if value)

        try:
            result = self.tasks_api.query(query_params)
        except exceptions.ApiException as e:
            print(e.message)
            sys.exit(1)

        if not result:
            print('No tasks found that match this query')
            sys.exit(1)

        if opts.json:
            tablefmt = TableFormat.JSON
        else:
            tablefmt = TableFormat.TABLE
        printer = PrettyPrinter(
            result, columns=TasksApi.SORTABLE_COLUMNS, format=tablefmt)
        print(printer.format())
Exemplo n.º 2
0
    def queue_job(self):
        all_keys = list(self.job.get_files(s3_conn=self.s3_conn))

        paths = self.job.spec.source.paths

        if self.options.dry_run:
            print "DRY RUN SUMMARY:"
            print "----------------"
            if len(all_keys) == 0:
                print "Paths %s matched no files" % ([str(p) for p in paths])
            else:
                print "List of files to load:"
                for key in all_keys:
                    print key.name
                print "Example LOAD DATA statement to execute:"
                file_id = self.job.get_file_id(all_keys[0])
                print load_data.build_example_query(self.job, file_id)
            sys.exit(0)
        elif len(all_keys) == 0:
            self.logger.warning(
                "Paths %s matched no files. Please check your path specification (be careful with relative paths)."
                % ([str(p) for p in paths]))

        self.jobs = None
        spec = self.job.spec
        try:
            self.logger.info('Creating job')
            self.jobs = Jobs()
            self.jobs.save(self.job)

            self.tasks = Tasks()

            etags = []
            for key in all_keys:
                if key.scheme in ['s3', 'hdfs']:
                    etags.append(key.etag)

            if etags and not self.options.force:
                database, table = spec.target.database, spec.target.table
                host, port = spec.connection.host, spec.connection.port
                competing_job_ids = [
                    j.id for j in self.jobs.query_target(
                        host, port, database, table)
                ]
                md5_map = self.get_current_tasks_md5_map(
                    etags, competing_job_ids)
            else:
                # For files loading on the filesystem, we are not going to MD5 files
                # for performance reasons. We are also basing this on the assumption
                # that filesystem loads are generally a one-time operation.
                md5_map = None
                if self.options.force:
                    self.logger.info(
                        'Loading all files in this job, regardless of identical files that are currently loading or were previously loaded (because of the --force flag)'
                    )
                if self.job.spec.options.file_id_column is not None:
                    self.logger.info(
                        'Since you\'re using file_id_column, duplicate records will be checked and avoided'
                    )

            count = self.submit_files(all_keys, md5_map, self.job,
                                      self.options.force)

            if count == 0:
                self.logger.info('Deleting the job, it has no child tasks')
                try:
                    self.jobs.delete(self.job)
                except:
                    self.logger.error("Rollback failed for job: %s",
                                      self.job.id)
            else:
                self.logger.info("Successfully queued job with id: %s",
                                 self.job.id)

                if not servers.is_server_running():
                    self.start_server()

                if self.options.sync:
                    self.wait_for_job()

        except (Exception, AssertionError):
            self.logger.error(
                'Failed to submit files, attempting to roll back job creation...'
            )
            exc_info = sys.exc_info()
            if self.jobs is not None:
                try:
                    self.jobs.delete(self.job)
                except:
                    self.logger.error("Rollback failed for job: %s",
                                      self.job.id)
            # Have to use this old-style raise because raise just throws
            # the last exception that occured, which could be the one in
            # the above try/except block and not the original exception.
            raise exc_info[0], exc_info[1], exc_info[2]
Exemplo n.º 3
0
    def run(self):
        """Worker loop: claim tasks and process them until told to exit.

        While ``self.exiting()`` is false, the loop sleeps for a random
        interval below half a second, asks ``self.tasks.start()`` for a
        task and records in ``self.worker_working`` whether one was
        obtained. Each task is processed via ``_process_task`` on a fresh,
        non-pooled connection after issuing ``BEGIN``.

        Outcome handling per task:
          * ``RequeueTask`` / ``ConnectionException``: the task is requeued.
          * ``TaskDoesNotExist``: logged only.
          * ``WorkerException``: the task is marked as errored.
          * any other exception: traceback logged at debug level, re-raised.

        On ``ExitingException`` the current task, if any, is requeued when
        ``task.valid()`` is false.
        """
        self.jobs = Jobs()
        self.tasks = Tasks()
        task = None

        # SIGINT and SIGQUIT are replaced with a no-op handler
        # (presumably so shutdown is driven through self.exiting() -- confirm).
        def ignore(*args, **kwargs):
            return None

        signal.signal(signal.SIGINT, ignore)
        signal.signal(signal.SIGQUIT, ignore)

        try:
            while not self.exiting():
                time.sleep(random.random() * 0.5)
                task = self.tasks.start()

                if task is None:
                    self.worker_working.value = 0
                    continue

                self.worker_working.value = 1
                job = self.jobs.get(task.job_id)

                # A connection id already stored on the task means a query
                # may still be running for it; kill that query first.
                stale_conn_id = task.data.get('conn_id', None)
                if stale_conn_id is not None:
                    self.kill_query_if_exists(job.spec.connection,
                                              stale_conn_id)

                self.logger.info('Task %d: starting' % task.task_id)

                try:
                    # can't use a pooled connection due to transactions
                    # staying open in the pool on failure
                    with pool.get_connection(
                            database=job.spec.target.database,
                            pooled=False,
                            **job.spec.connection) as db_connection:
                        db_connection.execute("BEGIN")
                        self._process_task(task, db_connection)
                    self.logger.info('Task %d: finished with success',
                                     task.task_id)
                except (RequeueTask, ConnectionException):
                    self.logger.info(
                        'Task %d: download failed, requeueing',
                        task.task_id)
                    self.logger.debug(
                        "Traceback: %s" % traceback.format_exc())
                    task.requeue()
                except TaskDoesNotExist:
                    self.logger.info(
                        'Task %d: finished with error, the task was either cancelled or deleted',
                        task.task_id)
                    self.logger.debug(
                        "Traceback: %s" % traceback.format_exc())
                except WorkerException as e:
                    task.error(str(e))
                    self.logger.info('Task %d: finished with error',
                                     task.task_id)
                except Exception:
                    self.logger.debug(
                        "Traceback: %s" % traceback.format_exc())
                    raise

            # Leaving the loop normally funnels into the same shutdown path
            # as an ExitingException raised from inside it.
            raise ExitingException()

        except ExitingException:
            self.logger.debug('Worker exiting')
            if task is None or task.valid():
                return
            try:
                task.requeue()
            except APSWSQLStepQueueException:
                pass