Exemplo n.º 1
0
 def log_url(self, job_execution):
     """Return the log URL for ``job_execution``.

     The job instance and its project are looked up first; the URL is then
     requested from the first spider service instance whose ``server``
     equals ``job_execution.running_on``.

     :param job_execution: execution providing ``job_instance_id``,
         ``running_on`` and ``service_job_execution_id``.
     :return: whatever the matching service instance's ``log_url`` returns,
         or ``None`` when no service instance matches.
     """
     instance = JobInstance.find_job_instance_by_id(job_execution.job_instance_id)
     owning_project = Project.find_project_by_id(instance.project_id)

     candidates = (
         service for service in self.spider_service_instances
         if service.server == job_execution.running_on
     )
     service = next(candidates, None)
     if service is None:
         return None

     return service.log_url(
         owning_project.project_name,
         instance.spider_name,
         job_execution.service_job_execution_id,
     )
Exemplo n.º 2
0
    def process_finished_jobs(self, job_status, job_execution_dict):
        """Mark reported-finished executions as FINISHED and collect their stats.

        For each entry in ``job_status[SpiderStatus.FINISHED]`` the execution
        with the same ``'id'`` is looked up in ``job_execution_dict``. Unknown
        executions and executions already marked FINISHED are skipped.
        Otherwise the start/end time and status are updated, the tail of the
        job log is downloaded and, if ``RAW_STATS_REGEX`` matches it, the raw
        stats are stored on the execution and pushed to the spider info.

        Stats collection is best-effort: a missing log URL, a failed or
        timed-out download, or a log without a stats match only skips the
        stats for that job and never aborts processing of the remaining jobs.

        :param job_status: mapping indexed by ``SpiderStatus`` values; the
            FINISHED entry is an iterable of dicts with ``'id'``,
            ``'start_time'`` and ``'end_time'`` keys.
        :param job_execution_dict: mapping from job id to job execution.
        :return: list of the ids of every finished job reported in
            ``job_status`` (including the skipped ones).
        """
        # Function-scope import so the block does not depend on a file-level
        # import that may not exist.
        import logging

        # Seconds to wait for the log server; without a timeout one
        # unresponsive server would block the caller indefinitely.
        log_fetch_timeout = 10

        found_jobs = []

        for job_execution_info in job_status[SpiderStatus.FINISHED]:
            job_id = job_execution_info['id']
            found_jobs.append(job_id)

            job_execution = job_execution_dict.get(job_id)
            if not job_execution or job_execution.running_status == SpiderStatus.FINISHED:
                # the minimum check
                continue

            job_execution.start_time = job_execution_info['start_time']
            job_execution.end_time = job_execution_info['end_time']
            job_execution.running_status = SpiderStatus.FINISHED

            log_url = self.log_url(job_execution)
            if not log_url:
                # No URL to fetch the log from: keep the status update and
                # skip the stats.
                continue

            try:
                # Only the last 4096 bytes of the log are requested.
                res = requests.get(log_url, headers={"Range": "bytes=-4096"},
                                   timeout=log_fetch_timeout)
            except requests.RequestException as exc:
                logging.getLogger(__name__).warning(
                    '[process_finished_jobs] failed to fetch log %s: %s',
                    log_url, exc)
                continue

            res.encoding = 'utf8'
            match = re.findall(job_execution.RAW_STATS_REGEX, res.text, re.DOTALL)
            if not match:
                continue

            execution_results = match[0]
            job_execution.raw_stats = execution_results
            job_execution.process_raw_stats()

            job_instance = JobInstance.find_job_instance_by_id(job_execution.job_instance_id)
            spider_info = SpiderInfo.get_spider_info(job_instance.spider_name, job_instance.project_id)
            spider_info.update_spider_info(job_execution.raw_stats)

        return found_jobs
Exemplo n.º 3
0
 def cancel_spider(self, job_execution):
     """Cancel ``job_execution`` on the service instance it is running on.

     Only the first spider service instance whose ``server`` equals
     ``job_execution.running_on`` is asked to cancel. When that call returns
     a truthy value, the execution gets the current time as ``end_time``,
     the CANCELED status, and the database session is committed.

     :param job_execution: execution providing ``job_instance_id``,
         ``running_on`` and ``service_job_execution_id``.
     :return: None
     """
     instance = JobInstance.find_job_instance_by_id(job_execution.job_instance_id)
     owning_project = Project.find_project_by_id(instance.project_id)

     matching_services = (
         service for service in self.spider_service_instances
         if service.server == job_execution.running_on
     )
     service = next(matching_services, None)
     if service is None:
         return

     cancelled = service.cancel_spider(
         owning_project.project_name,
         job_execution.service_job_execution_id)
     if not cancelled:
         return

     job_execution.end_time = datetime.datetime.now()
     job_execution.running_status = SpiderStatus.CANCELED
     db.session.commit()
Exemplo n.º 4
0
def run_spider_job(job_instance_id):
    '''
    Scheduler entry point: start the spider of one job instance.

    The job instance is looked up by id, handed to ``agent.start_spider``
    and an info line is logged. Any exception raised along the way is
    logged as an error and not re-raised.

    :param job_instance_id: id passed to JobInstance.find_job_instance_by_id
    :return: None
    '''
    try:
        instance = JobInstance.find_job_instance_by_id(job_instance_id)
        agent.start_spider(instance)

        log_fields = (instance.project_id, instance.spider_name, instance.id)
        message = (
            '[run_spider_job][project:%s][spider_name:%s][job_instance_id:%s]'
            % log_fields)
        app.logger.info(message)
    except Exception as exc:
        app.logger.error('[run_spider_job] ' + str(exc))
Exemplo n.º 5
0
def run_spider_job(job_instance_id):
    '''
    Scheduler entry point: start as many spider tasks as the job allows.

    Counts the executions of this job instance that are PENDING or RUNNING.
    If that count already reaches ``max_start_tasks`` nothing is started.
    Otherwise up to ``start_tasks`` spiders are started, capped by the
    remaining free slots, and one info line is logged per started spider.
    Any exception is logged as an error and not re-raised.

    :param job_instance_id: id passed to JobInstance.find_job_instance_by_id
    :return: None
    '''
    try:
        job_instance = JobInstance.find_job_instance_by_id(job_instance_id)
        requested = job_instance.start_tasks

        # NOTE: an earlier guard that returned when a RUNNING execution had
        # started more than 30 minutes ago was disabled here.

        active_statuses = [SpiderStatus.PENDING, SpiderStatus.RUNNING]
        active_query = JobExecution.query.filter_by(
            job_instance_id=job_instance_id)
        active_query = active_query.filter(
            JobExecution.running_status.in_(active_statuses))
        active_count = active_query.count()
        if active_count >= job_instance.max_start_tasks:
            return

        # Never start more spiders than there are free slots.
        free_slots = job_instance.max_start_tasks - active_count
        to_start = free_slots if requested > free_slots else requested

        # The logged index is 1-based; assumes the task counts are integers.
        for started in range(1, to_start + 1):
            agent.start_spider(job_instance)
            app.logger.info(
                '[run_spider_job][project:%s][spider_name:%s][job_instance_id:%s]'
                '[start_tasks:%s][i:%s]' %
                (job_instance.project_id, job_instance.spider_name,
                 job_instance.id, to_start, started))

    except Exception as exc:
        app.logger.error('[run_spider_job] ' + str(exc))