def put(self, project_id, spider_id):
    spider_instance = SpiderInstance.query.filter_by(
        project_id=project_id, id=spider_id).first()
    if not spider_instance:
        abort(404)
    job_instance = JobInstance()
    job_instance.spider_name = spider_instance.spider_name
    job_instance.project_id = project_id
    job_instance.spider_arguments = request.form.get('spider_arguments')
    job_instance.desc = request.form.get('desc')
    job_instance.tags = request.form.get('tags')
    job_instance.run_type = JobRunType.ONETIME
    job_instance.priority = request.form.get('priority', 0)
    job_instance.enabled = -1
    db.session.add(job_instance)
    db.session.commit()
    agent.start_spider(job_instance)
    return True
def download_items(project_id, job_exec_id):
    format = request.args.get('format')
    if format not in ['json', 'csv']:
        abort(404)
    job_execution = JobExecution.query.filter_by(
        project_id=project_id, id=job_exec_id).first()
    job_instance = JobInstance.find_job_instance_by_id(
        job_execution.job_instance_id)
    project = Project.find_project_by_id(job_instance.project_id)
    res = requests.get(agent.items_url(job_execution))
    res.encoding = 'utf8'
    # The items feed is JSON lines: one JSON object per non-empty line.
    json_data = [json.loads(s) for s in filter(None, res.text.split('\n'))]
    filename = '{}-{}.{}'.format(project.project_name,
                                 job_instance.spider_name, format)
    if format == 'json':
        with open(os.path.join(app.static_folder, filename), 'w') as f:
            f.write(json.dumps(json_data))
    elif format == 'csv':
        with open(os.path.join(app.static_folder, filename), 'w') as f:
            csvwriter = csv.writer(f)
            for count, item in enumerate(json_data):
                if count == 0:
                    # Use the first item's keys as the CSV header row.
                    csvwriter.writerow(item.keys())
                csvwriter.writerow(item.values())
    return send_from_directory(app.static_folder, filename, as_attachment=True)
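# A minimal, self-contained sketch of the JSON-lines-to-CSV conversion that
# download_items performs, runnable on its own; the two sample records are
# invented for illustration.
import csv
import io
import json

raw = '{"title": "a", "url": "http://x"}\n{"title": "b", "url": "http://y"}\n'
records = [json.loads(line) for line in filter(None, raw.split('\n'))]

buf = io.StringIO()
writer = csv.writer(buf)
writer.writerow(records[0].keys())  # header row from the first item's keys
for item in records:
    writer.writerow(item.values())
print(buf.getvalue())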
def log_url(self, job_execution):
    job_instance = JobInstance.find_job_instance_by_id(job_execution.job_instance_id)
    project = Project.find_project_by_id(job_instance.project_id)
    for spider_service_instance in self.spider_service_instances:
        if spider_service_instance.server == job_execution.running_on:
            return spider_service_instance.log_url(
                project.project_name, job_instance.spider_name,
                job_execution.service_job_execution_id)
def log_url_slave(self, job_execution):
    """
    Purpose: fetch the log of a slave spider; only one log is needed.
    :param job_execution: a JobExecution object
    :return: the URL of the log
    """
    job_instance = JobInstance.find_job_instance_by_id(job_execution.job_instance_id)
    project = Project.find_project_by_id(job_instance.project_id)
    # Server string for the master/slave spiders, joined with '>'.
    service_job_execution_id = job_execution.service_job_execution_id.split('>')
    # Comma-separated list of slave job execution ids.
    slave_service_job_execution_id = service_job_execution_id[1].split(',')
    # Addresses of the servers the spiders are running on.
    running_on = job_execution.running_on.split('>')
    slave_running_on = running_on[1].split(',')
    # Look up the slave spider's name.
    spider_name_slave_obj = SpiderInstance.query.filter_by(
        spider_name=job_instance.spider_name,
        project_id=job_instance.project_id).first()
    spider_name_slave = spider_name_slave_obj.spider_name_slave
    # Fetch the log from the matching slave service instance.
    for spider_service_instance in self.spider_service_instances_slave:
        for job_execution_id, running_on_ in zip(slave_service_job_execution_id,
                                                 slave_running_on):
            if spider_service_instance.server == running_on_:
                slave_log_url = spider_service_instance.log_url(
                    project.project_name, spider_name_slave, job_execution_id)
                return slave_log_url
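# Illustration of the "<master>'>'<slaves>" encoding that log_url_slave and
# log_url_master assume for service_job_execution_id and running_on; the ids
# and server addresses below are hypothetical.
service_job_execution_id = "abc123>def456,ghi789"
running_on = "http://master:6800>http://slave1:6800,http://slave2:6800"

master_id, slave_ids = service_job_execution_id.split('>')
master_server, slave_servers = running_on.split('>')

# Pair each slave job execution id with the server it ran on.
for job_id, server in zip(slave_ids.split(','), slave_servers.split(',')):
    print(server, job_id)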
def create_job_execution(self, job, project_id):
    from SpiderKeeper.app.spider.model import JobExecution, JobInstance, JobRunType
    from SpiderKeeper.app import agent
    from SpiderKeeper.app import db
    execution_id = job.get('id', 0)
    if JobExecution.query.filter_by(
            service_job_execution_id=execution_id).first():
        return
    job_instance = JobInstance()
    job_instance.spider_name = job.get('spider', 'unknown')
    job_instance.project_id = project_id
    job_instance.spider_arguments = ''
    job_instance.priority = 0
    job_instance.run_type = JobRunType.ONETIME
    db.session.add(job_instance)
    db.session.commit()
    job_execution = JobExecution()
    job_execution.project_id = project_id
    job_execution.service_job_execution_id = execution_id
    job_execution.create_time = self.convert_time(job, 'start_time')
    job_execution.end_time = self.convert_time(job, 'end_time')
    job_execution.running_on = agent.spider_service_instances[0].server
    job_execution.job_instance = job_instance
    # job_instance.id is populated by the commit above.
    job_execution.job_instance_id = job_instance.id
    db.session.add(job_execution)
    db.session.commit()
def cancel_spider(self, job_execution):
    job_instance = JobInstance.find_job_instance_by_id(job_execution.job_instance_id)
    project = Project.find_project_by_id(job_instance.project_id)
    # TODO: multi service
    for spider_service_instance in self.spider_service_instances:
        if spider_service_instance.server == job_execution.running_on:
            if spider_service_instance.cancel_spider(
                    project.project_name, job_execution.service_job_execution_id):
                job_execution.running_status = SpiderStatus.CANCELED
                db.session.commit()
            break
def cancel_spider(self, job_execution):
    job_instance = JobInstance.find_job_instance_by_id(job_execution.job_instance_id)
    project = Project.find_project_by_id(job_instance.project_id)
    for spider_service_instance in self.spider_service_instances:
        if spider_service_instance.server == job_execution.running_on:
            if spider_service_instance.cancel_spider(
                    project.project_name, job_execution.service_job_execution_id):
                job_execution.end_time = datetime.datetime.now()
                job_execution.running_status = SpiderStatus.CANCELED
                db.session.commit()
            break
def run_spider_job(job_instance_id):
    '''
    run spider by scheduler
    :param job_instance_id:
    :return:
    '''
    try:
        job_instance = JobInstance.find_job_instance_by_id(job_instance_id)
        agent.start_spider(job_instance)
        app.logger.info('[run_spider_job][project:%s][spider_name:%s][job_instance_id:%s]' % (
            job_instance.project_id, job_instance.spider_name, job_instance.id))
    except Exception as e:
        app.logger.error('[run_spider_job] ' + str(e))
def run_spider_job(job_instance_id):
    """
    Purpose: start a spider through scrapyd.
    :param job_instance_id:
    :return:
    """
    try:
        job_instance = JobInstance.find_job_instance_by_id(job_instance_id)
        agent.start_spider(job_instance)
        app.logger.info('[APScheduler dispatched a spider job] [project: %s] [spider: %s]'
                        ' [job instance id: %s]' % (job_instance.project_id,
                                                    job_instance.spider_name,
                                                    job_instance.id))
    except Exception as e:
        app.logger.error('[APScheduler failed to run the spider job; error:] ' + str(e))
def cancel_spider(self, job_execution):
    job_instance = JobInstance.find_job_instance_by_id(job_execution.job_instance_id)
    project = Project.find_project_by_id(job_instance.project_id)
    for spider_service_instance in self.spider_service_instances:
        if spider_service_instance.server == job_execution.running_on:
            if spider_service_instance.cancel_spider(
                    project.project_name, job_execution.service_job_execution_id):
                job_execution.end_time = datetime.datetime.now()
                job_execution.running_status = SpiderStatus.CANCELED
                try:
                    db.session.commit()
                except Exception:
                    db.session.rollback()
                    raise
            break
def run_once():
    """
    Purpose: run a spider once.
    :param: project_id: project id
    :param: spider_name: spider name
    :param: spider_arguments: arguments passed to the spider
    :param: priority: job priority
    :param: daemon: thread type of the job, i.e. whether it is a daemon thread
    :return: json.dumps({"code": 200, "status": "success/e"}), where e is the raised exception
    """
    try:
        # Create a JobInstance record.
        job_instance = JobInstance()
        # Read the project id.
        project_id = request.form.get('project_id')
        # Store the spider name.
        job_instance.spider_name = request.form.get('spider_name')
        # Store the project id.
        job_instance.project_id = project_id
        # Store the spider arguments.
        job_instance.spider_arguments = request.form.get('spider_arguments')
        # Read and store the job priority.
        job_instance.priority = request.form.get('priority', 0)
        # Mark the run type as one-time.
        job_instance.run_type = 'onetime'
        # Choose the daemon setting manually if requested.
        if request.form['daemon'] != 'auto':
            spider_args = []
            if request.form['spider_arguments']:
                spider_args = request.form['spider_arguments'].split(",")
            spider_args.append("daemon={}".format(request.form['daemon']))
            job_instance.spider_arguments = ','.join(spider_args)
        # Disable periodic scheduling.
        job_instance.enabled = -1
        # Persist the record.
        db.session.add(job_instance)
        db.session.commit()
        # Start the spider instance.
        agent.start_spider(job_instance)
        return json.dumps({"code": 200, "status": "success"})
    except Exception as e:
        return json.dumps({"code": 500, "status": "error", "msg": str(e)})
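# Hypothetical client call for the run_once endpoint; the base URL and route
# are assumptions -- adjust to wherever this Flask app is mounted.
import requests

resp = requests.post('http://localhost:5000/run_once', data={
    'project_id': 1,
    'spider_name': 'example_spider',
    'spider_arguments': 'seed=http://example.com',
    'priority': 0,
    'daemon': 'auto',
})
print(resp.json())  # {"code": 200, "status": "success"} on success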
def clear_jobexecution(job_execution):
    """
    Check whether the JobExecution still exists on the scrapyd servers
    and delete it if it no longer does.
    :param job_execution:
    :return:
    """
    job_instance = JobInstance.find_job_instance_by_id(
        job_execution.job_instance_id)
    project = Project.find_project_by_id(job_instance.project_id)
    if not check_job_existed(running_on=job_execution.running_on,
                             project_name=project.project_name,
                             spider_name=job_instance.spider_name,
                             job_id=job_execution.service_job_execution_id):
        db.session.delete(job_execution)
        db.session.commit()
def job_update(project_id, job_id):
    project = Project.find_project_by_id(project_id)
    job_instance = JobInstance.find_job_instance_by_id(job_id)
    if job_instance is None:
        abort(404)
    job_instance.spider_name = request.form['spider_name']
    job_instance.project_id = project_id
    job_instance.spider_arguments = request.form['spider_arguments']
    job_instance.desc = request.form['desc']
    job_instance.tags = request.form.get('spider_tags', "")
    job_instance.priority = request.form.get('priority', 0)
    job_instance.run_type = request.form['run_type']
    # choose daemon manually
    if request.form['daemon'] != 'auto':
        spider_args = []
        if request.form['spider_arguments']:
            spider_args = request.form['spider_arguments'].split(",")
        spider_args.append("daemon={}".format(request.form['daemon']))
        job_instance.spider_arguments = ','.join(spider_args)
    if job_instance.run_type == JobRunType.ONETIME:
        job_instance.enabled = -1
        db.session.add(job_instance)
        db.session.commit()
        agent.start_spider(job_instance)
    if job_instance.run_type == JobRunType.PERIODIC:
        job_instance.cron_minutes = request.form.get('cron_minutes') or '0'
        job_instance.cron_hour = request.form.get('cron_hour') or '*'
        job_instance.cron_day_of_month = request.form.get('cron_day_of_month') or '*'
        job_instance.cron_day_of_week = request.form.get('cron_day_of_week') or '*'
        job_instance.cron_month = request.form.get('cron_month') or '*'
        # set cron expression manually
        if request.form.get('cron_exp'):
            job_instance.cron_minutes, job_instance.cron_hour, \
                job_instance.cron_day_of_month, job_instance.cron_day_of_week, \
                job_instance.cron_month = request.form['cron_exp'].split(' ')
        db.session.add(job_instance)
        db.session.commit()
    return redirect(request.referrer, code=302)
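# Worked example of the five-field cron_exp unpacking used above. Note the
# order is minutes, hour, day-of-month, day-of-week, month -- the order these
# handlers expect, which differs from the standard crontab field order.
cron_exp = '0 */6 * * *'  # minute 0 of every 6th hour, in this encoding
cron_minutes, cron_hour, cron_day_of_month, cron_day_of_week, cron_month = \
    cron_exp.split(' ')
print(cron_minutes, cron_hour)  # -> 0 */6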
def log_url_master(self, job_execution):
    """
    Purpose: fetch the log of the master spider.
    :param job_execution: a JobExecution object
    :return: the URL of the log
    """
    job_instance = JobInstance.find_job_instance_by_id(job_execution.job_instance_id)
    project = Project.find_project_by_id(job_instance.project_id)
    # Server string for the master/slave spiders, joined with '>'.
    service_job_execution_id = job_execution.service_job_execution_id.split('>')
    # Job execution id on the master server.
    master_service_job_execution_id = service_job_execution_id[0]
    # Addresses of the servers the spiders are running on.
    running_on = job_execution.running_on.split('>')
    master_running_on = running_on[0]
    # Fetch the log from the master service instance.
    for spider_service_instance in self.spider_service_instances_master:
        if spider_service_instance.server == master_running_on:
            master_log_url = spider_service_instance.log_url(
                project.project_name, job_instance.spider_name,
                master_service_job_execution_id)
            return master_log_url
def job_add(project_id):
    project = Project.find_project_by_id(project_id)
    job_instance = JobInstance()
    job_instance.spider_name = request.form['spider_name']
    job_instance.project_id = project_id
    job_instance.spider_arguments = request.form['spider_arguments']
    job_instance.priority = request.form.get('priority', 0)
    job_instance.run_type = request.form['run_type']
    if job_instance.run_type == JobRunType.ONETIME:
        job_instance.enabled = -1
        db.session.add(job_instance)
        db.session.commit()
        agent.start_spider(job_instance)
    if job_instance.run_type == JobRunType.PERIODIC:
        job_instance.cron_minutes = request.form.get('cron_minutes') or '0'
        job_instance.cron_hour = request.form.get('cron_hour') or '*'
        job_instance.cron_day_of_month = request.form.get('cron_day_of_month') or '*'
        job_instance.cron_day_of_week = request.form.get('cron_day_of_week') or '*'
        job_instance.cron_month = request.form.get('cron_month') or '*'
        db.session.add(job_instance)
        db.session.commit()
    return redirect(request.referrer, code=302)
def job_add(project_id):
    # Save the uploaded file, and record its path in
    # job_instance.spider_arguments.
    dst = ''
    if 'file' in request.files:
        file = request.files['file']
        # If the user does not select a file, the browser may still submit
        # an empty part without a filename.
        if file.filename == '':
            pass
        if file and allowed_seed(file.filename):
            filename = secure_filename(file.filename)
            dst = os.path.join(
                app.config['UPLOAD_DIR'],
                datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S-") + filename)
            file.save(dst)
    project = Project.find_project_by_id(project_id)
    job_instance = JobInstance()
    job_instance.spider_name = request.form['spider_name']
    job_instance.project_id = project_id
    job_instance.spider_arguments = request.form['spider_arguments']
    if dst:
        if job_instance.spider_arguments:
            job_instance.spider_arguments += ",seed={}".format(dst)
        else:
            job_instance.spider_arguments = "seed={}".format(dst)
    job_instance.priority = request.form.get('priority', 0)
    job_instance.run_type = request.form['run_type']
    # choose daemon manually
    if request.form['daemon'] != 'auto':
        spider_args = []
        if request.form['spider_arguments']:
            spider_args = request.form['spider_arguments'].split(",")
        spider_args.append("daemon={}".format(request.form['daemon']))
        job_instance.spider_arguments = ','.join(spider_args)
    if job_instance.run_type == JobRunType.ONETIME:
        job_instance.enabled = -1
        db.session.add(job_instance)
        db.session.commit()
        agent.start_spider(job_instance)
    if job_instance.run_type == JobRunType.PERIODIC:
        job_instance.cron_minutes = request.form.get('cron_minutes') or '0'
        job_instance.cron_hour = request.form.get('cron_hour') or '*'
        job_instance.cron_day_of_month = request.form.get('cron_day_of_month') or '*'
        job_instance.cron_day_of_week = request.form.get('cron_day_of_week') or '*'
        job_instance.cron_month = request.form.get('cron_month') or '*'
        # set cron expression manually
        if request.form.get('cron_exp'):
            job_instance.cron_minutes, job_instance.cron_hour, \
                job_instance.cron_day_of_month, job_instance.cron_day_of_week, \
                job_instance.cron_month = request.form['cron_exp'].split(' ')
        db.session.add(job_instance)
        db.session.commit()
    return redirect(request.referrer, code=302)
def post(self, project_id):
    post_data = request.form
    if post_data:
        job_instance = JobInstance()
        job_instance.spider_name = post_data['spider_name']
        job_instance.project_id = project_id
        job_instance.spider_arguments = post_data.get('spider_arguments')
        job_instance.desc = post_data.get('desc')
        job_instance.tags = post_data.get('tags')
        job_instance.run_type = post_data['run_type']
        job_instance.priority = post_data.get('priority', 0)
        if job_instance.run_type == 'onetime':
            job_instance.enabled = -1
            db.session.add(job_instance)
            # Retry the commit a few times before giving up.
            for i in range(3):
                try:
                    db.session.commit()
                    break
                except Exception:
                    db.session.rollback()
                    continue
            agent.start_spider(job_instance)
        if job_instance.run_type == "periodic":
            job_instance.cron_minutes = post_data.get('cron_minutes') or '0'
            job_instance.cron_hour = post_data.get('cron_hour') or '*'
            job_instance.cron_day_of_month = post_data.get('cron_day_of_month') or '*'
            job_instance.cron_day_of_week = post_data.get('cron_day_of_week') or '*'
            job_instance.cron_month = post_data.get('cron_month') or '*'
            db.session.add(job_instance)
            db.session.commit()
        return True
def job_add(project_id):
    project = Project.find_project_by_id(project_id)
    job_instance = JobInstance()
    job_instance.spider_name = request.form['spider_name']
    job_instance.project_id = project_id
    job_instance.export_folder = 'export_' + datetime.datetime.today().strftime(
        '%Y.%m.%d_%H.%M.%S')
    job_instance.spider_arguments = request.form['spider_arguments']
    # Append the export folder to the spider arguments.
    if job_instance.spider_arguments:
        job_instance.spider_arguments += ','
    job_instance.spider_arguments += 'export_folder=' + job_instance.export_folder
    job_instance.priority = request.form.get('priority', 0)
    job_instance.run_type = request.form['run_type']
    # choose daemon manually (split the accumulated arguments so the
    # export_folder argument added above is preserved)
    if request.form['daemon'] != 'auto':
        spider_args = []
        if job_instance.spider_arguments:
            spider_args = job_instance.spider_arguments.split(",")
        spider_args.append("daemon={}".format(request.form['daemon']))
        job_instance.spider_arguments = ','.join(spider_args)
    if job_instance.run_type == JobRunType.ONETIME:
        job_instance.enabled = -1
        db.session.add(job_instance)
        db.session.commit()
        agent.start_spider(job_instance)
    if job_instance.run_type == JobRunType.PERIODIC:
        job_instance.cron_minutes = request.form.get('cron_minutes') or '0'
        job_instance.cron_hour = request.form.get('cron_hour') or '*'
        job_instance.cron_day_of_month = request.form.get('cron_day_of_month') or '*'
        job_instance.cron_day_of_week = request.form.get('cron_day_of_week') or '*'
        job_instance.cron_month = request.form.get('cron_month') or '*'
        # set cron expression manually
        if request.form.get('cron_exp'):
            job_instance.cron_minutes, job_instance.cron_hour, \
                job_instance.cron_day_of_month, job_instance.cron_day_of_week, \
                job_instance.cron_month = request.form['cron_exp'].split(' ')
        db.session.add(job_instance)
        db.session.commit()
    return redirect(request.referrer, code=302)
def job_add(project_id):
    project = Project.find_project_by_id(project_id)
    job_instance = JobInstance()
    job_instance.spider_name = request.form['spider_name']
    job_instance.project_id = project_id
    job_instance.spider_arguments = request.form['spider_arguments']
    job_instance.priority = request.form.get('priority', 0)
    job_instance.run_type = request.form['run_type']
    # choose daemon manually
    if request.form['daemon'] != 'auto':
        spider_args = []
        if request.form['spider_arguments']:
            spider_args = request.form['spider_arguments'].split(",")
        spider_args.append("daemon={}".format(request.form['daemon']))
        job_instance.spider_arguments = ','.join(spider_args)
    if job_instance.run_type == JobRunType.ONETIME:
        job_instance.enabled = -1
        try:
            db.session.add(job_instance)
            db.session.commit()
        except Exception:
            db.session.rollback()
            raise
        agent.start_spider(job_instance)
    if job_instance.run_type == JobRunType.PERIODIC:
        job_instance.cron_minutes = request.form.get('cron_minutes') or '0'
        job_instance.cron_hour = request.form.get('cron_hour') or '*'
        job_instance.cron_day_of_month = request.form.get('cron_day_of_month') or '*'
        job_instance.cron_day_of_week = request.form.get('cron_day_of_week') or '*'
        job_instance.cron_month = request.form.get('cron_month') or '*'
        # set cron expression manually
        if request.form.get('cron_exp'):
            job_instance.cron_minutes, job_instance.cron_hour, \
                job_instance.cron_day_of_month, job_instance.cron_day_of_week, \
                job_instance.cron_month = request.form['cron_exp'].split(' ')
        try:
            db.session.add(job_instance)
            db.session.commit()
        except Exception:
            db.session.rollback()
            raise
    return redirect(request.referrer, code=302)
def add_scheduler():
    """
    Purpose: add a periodic schedule for a spider and persist it to the database.
    :param: project_id: project id
    :param: spider_name: spider name
    :param: spider_arguments: arguments passed to the spider
    :param: priority: job priority
    :param: daemon: thread type of the job, i.e. whether it is a daemon thread
    :param: cron_minutes: schedule field - minute
    :param: cron_hour: schedule field - hour
    :param: cron_day_of_month: schedule field - day of month
    :param: cron_day_of_week: schedule field - day of week
    :return: json.dumps({"code": 200, "status": "success/e"}), where e is the raised exception
    """
    try:
        project_id = request.form.get('project_id')
        job_instance = JobInstance()
        job_instance.spider_name = request.form['spider_name']
        job_instance.project_id = project_id
        job_instance.spider_arguments = request.form['spider_arguments']
        job_instance.priority = request.form.get('priority', 0)
        job_instance.run_type = 'periodic'
        # choose daemon manually
        if request.form['daemon'] != 'auto':
            spider_args = []
            if request.form['spider_arguments']:
                spider_args = request.form['spider_arguments'].split(",")
            spider_args.append("daemon={}".format(request.form['daemon']))
            job_instance.spider_arguments = ','.join(spider_args)
        job_instance.cron_minutes = request.form.get('cron_minutes') or '0'
        job_instance.cron_hour = request.form.get('cron_hour') or '*'
        job_instance.cron_day_of_month = request.form.get('cron_day_of_month') or '*'
        job_instance.cron_day_of_week = request.form.get('cron_day_of_week') or '*'
        job_instance.cron_month = request.form.get('cron_month') or '*'
        if request.form.get('cron_exp'):
            job_instance.cron_minutes, job_instance.cron_hour, \
                job_instance.cron_day_of_month, job_instance.cron_day_of_week, \
                job_instance.cron_month = request.form['cron_exp'].split(' ')
        db.session.add(job_instance)
        db.session.commit()
        return json.dumps({"code": 200, "status": "success"})
    except Exception as e:
        return json.dumps({"code": 500, "status": "error", "msg": str(e)})
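# Hypothetical client call for the add_scheduler endpoint (the base URL and
# route are assumptions), scheduling the spider at minute 30 of every hour.
import requests

resp = requests.post('http://localhost:5000/add_scheduler', data={
    'project_id': 1,
    'spider_name': 'example_spider',
    'spider_arguments': '',
    'priority': 0,
    'daemon': 'auto',
    'cron_minutes': '30',
    'cron_hour': '*',
})
print(resp.json())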
def job_get(project_id, job_id):
    job_instance = JobInstance.find_job_instance_by_id(job_id)
    if job_instance is None:
        abort(404)
    return json.dumps(job_instance.to_dict())
def post(self, project_id):
    post_data = request.form
    if post_data:
        job_instance = JobInstance()
        job_instance.spider_name = post_data['spider_name']
        job_instance.project_id = project_id
        job_instance.spider_arguments = post_data.get('spider_arguments')
        job_instance.desc = post_data.get('desc')
        job_instance.tags = post_data.get('tags')
        job_instance.run_type = post_data['run_type']
        job_instance.priority = post_data.get('priority', 0)
        if job_instance.run_type == "periodic":
            job_instance.cron_minutes = post_data.get('cron_minutes') or '0'
            job_instance.cron_hour = post_data.get('cron_hour') or '*'
            job_instance.cron_day_of_month = post_data.get('cron_day_of_month') or '*'
            job_instance.cron_day_of_week = post_data.get('cron_day_of_week') or '*'
            job_instance.cron_month = post_data.get('cron_month') or '*'
        db.session.add(job_instance)
        db.session.commit()
        return True
def job_add(project_id):
    project = Project.find_project_by_id(project_id)
    job_instance = JobInstance()
    job_instance.spider_name = request.form['spider_name']
    job_instance.project_id = project_id
    job_instance.spider_arguments = request.form['spider_arguments']
    job_instance.priority = request.form.get('priority', 0)
    job_instance.run_type = request.form['run_type']
    spider_url = request.form['spider_url']
    spider_models = request.form['spider_models']
    if job_instance.spider_name == "news":
        allowed_domains = spider_url.split('/')[2]
        # Pass the allowed domain and model name through spider_arguments.
        job_instance.spider_arguments = (
            "allowed_domains=" + allowed_domains + ",model=" + spider_models)
        r = CRedis()
        r.lpush(allowed_domains + ':start_urls', spider_url)
    elif job_instance.spider_name == 'jd':
        r = CRedis()
        r.lpush('jd:start_urls', spider_url)
    if job_instance.run_type == JobRunType.ONETIME:
        job_instance.enabled = -1
        db.session.add(job_instance)
        db.session.commit()
        agent.start_spider(job_instance)
    if job_instance.run_type == JobRunType.PERIODIC:
        job_instance.cron_minutes = request.form.get('cron_minutes') or '0'
        job_instance.cron_hour = request.form.get('cron_hour') or '*'
        job_instance.cron_day_of_month = request.form.get('cron_day_of_month') or '*'
        job_instance.cron_day_of_week = request.form.get('cron_day_of_week') or '*'
        job_instance.cron_month = request.form.get('cron_month') or '*'
        db.session.add(job_instance)
        db.session.commit()
    return redirect(request.referrer, code=302)
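# Sketch of the Redis seeding performed above, using redis-py directly;
# CRedis is assumed to be a thin wrapper around such a client, and the
# '<domain>:start_urls' list is assumed to be the queue a redis-backed
# spider reads its start URLs from.
import redis

r = redis.Redis(host='localhost', port=6379)
r.lpush('example.com:start_urls', 'http://example.com/page')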