def count_jobs_by_account_id(cls, account_id, status='running'):
    """Count jobs owned by *account_id*.

    :param account_id: the account to count jobs for.
    :param status: restrict to this job status; falsy counts all statuses.
    :return: number of matching Job rows.
    """
    query = db_session.query(Job).filter(Job.account == account_id)
    if status:
        query = query.filter(Job.status == status)
    return query.count()
def select_by_parameter(cls, parameter):
    """Page through MainUrl rows matching sort/status/keyword filters.

    :param parameter: dict with 'page', 'limit', 'status', 'keyword', 'sort'.
    :return: dict with 'code', 'message', 'data' (list of row dicts) and
        'count' (total matches ignoring pagination).
    """
    page = int(parameter['page'])
    limit = int(parameter['limit'])
    status = int(parameter['status'])
    keyword = str(parameter['keyword'])
    sort = int(parameter['sort'])
    pattern = "%{}%".format(keyword)
    try:
        # Build the filter once and reuse it for both the page slice and
        # the total count (the original duplicated the whole filter).
        base = db_session.query(MainUrl).filter(
            MainUrl.sort == sort, MainUrl.status == status,
            MainUrl.webSite.like(pattern))
        datas = base.limit(limit).offset((page - 1) * limit)
        count = base.count()
        db_session.close()
        return {
            "code": "200",
            "message": "succeed",
            "data": [item.single_to_dict() for item in datas],
            "count": count
        }
    except (SqlalchemyIntegrityError, PymysqlIntegrityError,
            InvalidRequestError):
        db_session.close()
        # Fixed typo: the failure message was spelled "fialed".
        return {"code": "404", "message": "failed", "data": [], "count": 0}
def update_task_name(cls, parameter):
    """Attach or detach MainUrl rows to/from a spider task.

    :param parameter: dict with 'task_id' (int), 'main_url_pids' (string
        list literal such as "[1, 2]") and 'operation' ("import" binds the
        rows to the task, "remove" resets them to 0).
    :return: dict with a 'code' and a 'message'.
    """
    import ast

    spider_name = int(parameter['task_id'])
    main_url_pids = parameter['main_url_pids']
    operation = str(parameter['operation'])
    if main_url_pids == "":
        return {"code": "202", "message": "并没有移除数据"}
    try:
        # Security fix: the original used eval() on request data;
        # ast.literal_eval only accepts Python literals.
        pids = ast.literal_eval(main_url_pids)
        if operation in ("import", "remove"):
            new_name = spider_name if operation == "import" else 0
            for main_url_pid in pids:
                main_url = db_session.query(MainUrl).filter(
                    MainUrl.pid == main_url_pid).first()
                # Robustness: skip unknown pids instead of crashing on None.
                if main_url is not None:
                    main_url.spider_name = new_name
        db_session.commit()
        db_session.close()
        return {"code": "200", "message": "更新成功"}
    except (SqlalchemyIntegrityError, PymysqlIntegrityError,
            InvalidRequestError):
        db_session.close()
        return {"code": "404", "message": "更新失败"}
def select_all(cls, parameter):
    """Page through active MainUrl rows not yet bound to any spider.

    :param parameter: dict with 'page', 'limit' and 'sort'.
    :return: dict with 'code', 'message', 'data' (list of row dicts) and
        'count' (total matches ignoring pagination).
    """
    page = int(parameter['page'])
    limit = int(parameter['limit'])
    sort = int(parameter['sort'])
    try:
        # Build the filter once for both the page slice and the count.
        base = db_session.query(MainUrl).filter(
            MainUrl.sort == sort, MainUrl.spider_name == 0,
            MainUrl.status == 1)
        datas = base.limit(limit).offset((page - 1) * limit)
        count = base.count()
        db_session.close()
        return {
            "code": "200",
            "message": "succeed",
            "data": [item.single_to_dict() for item in datas],
            "count": count
        }
    except (SqlalchemyIntegrityError, PymysqlIntegrityError,
            InvalidRequestError):
        db_session.close()
        # Fixed typo: the failure message was spelled "fialed".
        return {"code": "404", "message": "failed", "data": [], "count": 0}
def get_all_need_restart_task(cls):
    """
    主要用于服务器宕机后重新启动时获取所有需要启动的任务,包括pending状态和running状态的
    (Fetch id/status of every task that has not reached a terminal state,
    used to restart work after a server crash.)
    :return: list of (Task.id, Task.status) rows.
    """
    terminal_states = ('succeed', 'failed', 'cancelled')
    pending_query = db_session.query(Task.id, Task.status).filter(
        Task.status.notin_(terminal_states))
    return pending_query.all()
def set_job_by_track_ids(cls, track_ids, values):
    """Bulk-update jobs addressed by their track ids.

    :param track_ids: list of track ids to update. NOTE: matched ids are
        removed from this list in place; callers rely on the remainder.
    :param values: mapping track_id -> {'status', 'result', 'traceback'}.
    :return: the track ids that were NOT found in the Job table; on a
        failed commit, the original (unmutated) id list is returned.
    """
    jobs = db_session.query(Job).filter(Job.track_id.in_(track_ids)).all()
    track_ids_copy = track_ids.copy()
    try:
        for job in jobs:
            track_ids.remove(job.track_id)
            value = values.get(job.track_id, {})
            new_status = value.get('status')
            new_result = value.get('result', '')
            new_traceback = value.get('traceback', '')
            if job.status != new_status:
                # 第一次变成running的时间即启动时间
                # (first transition to running marks the start time)
                if new_status == 'running':
                    job.start_time = datetime.datetime.now()
                if new_status in ['succeed', 'failed']:
                    job.end_time = datetime.datetime.now()
                job.result = new_result
                job.traceback = new_traceback
                job.status = new_status
        db_session.commit()
    # Fixed: the bare `except:` also swallowed SystemExit/KeyboardInterrupt.
    except Exception:
        logger.exception('set_job_by_track_ids catch exception.')
        db_session.rollback()
        return track_ids_copy
    return track_ids
def get_all_need_check_task(cls, last_time):
    """
    获取所有需要检查的任务(即状态可能被用户修改的任务)
    (Tasks whose status may have been changed by the user since *last_time*.)
    :param last_time: lower bound on Task.last_update.
    :return: list of (Task.id, Task.status, Task.last_update) rows.
    """
    mutable_states = ('pausing', 'running', 'cancelled')
    checked = db_session.query(Task.id, Task.status, Task.last_update)
    return checked.filter(and_(Task.status.in_(mutable_states),
                               Task.last_update >= last_time)).all()
def set_job_result(cls, job_id, result):
    """Store *result* on the job with primary key *job_id*.

    :return: True when the job exists and was updated, else False.
    """
    job = db_session.query(Job).filter(Job.id == job_id).first()
    if not job:
        return False
    job.result = result
    db_session.commit()
    return True
def set_aps_status(cls, aps_id, status):
    """Set *status* on the TaskAccountGroup row matching *aps_id*.

    :return: True when the row exists and was updated, else False.
    """
    group = db_session.query(TaskAccountGroup).filter(
        TaskAccountGroup.aps_id == aps_id).first()
    if not group:
        return False
    group.status = status
    db_session.commit()
    return True
def set_task_result(cls, task_id, result):
    """Store *result* on a task and refresh its last_update timestamp.

    :return: True when the task exists and was updated, else False.
    """
    task = db_session.query(Task).filter(Task.id == task_id).first()
    if not task:
        return False
    task.result = result
    task.last_update = datetime.datetime.now()
    db_session.commit()
    return True
def start_task(cls, parameter):
    """Build crawler start parameters for every active MainUrl of a task.

    :param parameter: dict with 'id' — the spider/task id whose urls run.
    :return: list of dicts with keys 'rule', 'pid', 'webSite', 'url'.
    """
    def default_rule():
        # Fallback rule: no filter, xpath selector, crawl depth 1.
        return {
            'filter_rule': '',
            'selector': 'xpath',
            'deep_limit': '1',
            'fields': {
                'title': '',
                'author': '',
                'publishTime': '',
                'content': ''
            }
        }

    def resolve_rule(item, url):
        # Decode the stored rule; a missing or unparsable rule falls back
        # to the default-algorithm rule (logged, as before).
        try:
            rule = item["rule"]
            # Fixed `== None` comparisons to `is None` / membership test.
            if rule is None or rule in ("null", ""):
                crawler_info.info(
                    "{} : has no filtering rules, default algorithm acquisition"
                    .format(url))
                return default_rule()
            filter_rule = json.loads(rule)["filter_rule"]
            if filter_rule:
                # '@' stands in for '+' in the stored JSON — undo it.
                return json.loads(rule.replace("@", "+"))
            # NOTE(review): with an empty filter_rule the raw JSON string is
            # returned unparsed — preserved from the original behaviour.
            return rule
        # Was a bare `except:`; narrowed so Ctrl-C is not swallowed.
        except Exception:
            crawler_info.info(
                "{} : has no filtering rules, default algorithm acquisition"
                .format(url))
            return default_rule()

    spider_name = int(parameter['id'])
    datas = db_session.query(MainUrl).filter(
        MainUrl.spider_name == spider_name, MainUrl.status == 1).all()
    db_session.close()
    parameters = []
    for item in [item.single_to_dict() for item in datas]:
        url = item.get("address")
        parameters.append({
            "rule": resolve_rule(item, url),
            "pid": item.get("pid"),
            "webSite": item.get("webSite"),
            "url": str(url).strip(),
        })
    return parameters
def get_account_tasks(cls, account_id):
    """
    查询该账号关联的所有任务
    (List every task id associated with the account.)
    :param account_id: the account whose task bindings are queried.
    :return: list of task ids from TaskAccountGroup.
    """
    bindings = db_session.query(TaskAccountGroup).filter(
        TaskAccountGroup.account_id == account_id).all()
    return [binding.task_id for binding in bindings]
def set_job_status(cls, job_id, status):
    """Set *status* on a job, stamping start/end times on transitions.

    :return: True when the job exists and was updated, else False.
    """
    job = db_session.query(Job).filter(Job.id == job_id).first()
    if not job:
        return False
    if job.status != status:
        # 第一次变成running的时间即启动时间
        # (first transition to running marks the start time)
        if status == 'running':
            job.start_time = datetime.datetime.now()
        # Fixed: terminal statuses everywhere else in this module are
        # 'succeed'/'failed' (see set_job_by_track_id); the previous
        # 'success'/'failure' never matched, so end_time was never set.
        if status in ['succeed', 'failed']:
            job.end_time = datetime.datetime.now()
    job.status = status
    db_session.commit()
    return True
def set_job_by_track_id(cls, track_id, status, result='', traceback=''):
    """Update status/result/traceback of the job matching *track_id*.

    Start and end timestamps are stamped on the first transition into
    'running' and into a terminal ('succeed'/'failed') state respectively.
    :return: True when the job exists and was updated, else False.
    """
    job = db_session.query(Job).filter(Job.track_id == track_id).first()
    if not job:
        return False
    if job.status != status:
        # 第一次变成running的时间即启动时间
        # (first transition to running marks the start time)
        if status == 'running':
            job.start_time = datetime.datetime.now()
        if status in ['succeed', 'failed']:
            job.end_time = datetime.datetime.now()
    job.result = result
    job.traceback = traceback
    job.status = status
    db_session.commit()
    return True
def update_mainurl(cls, parameter):
    """Partially update one MainUrl row.

    Only the keys present in *parameter* ('remark', 'status', 'rule') are
    written; absent keys are skipped, keeping the original best-effort
    semantics without the blanket ``except: pass`` that also hid real
    database errors.

    :param parameter: dict with 'pid' plus any of 'remark'/'status'/'rule'.
    """
    pid = parameter['pid']
    mainurl = db_session.query(MainUrl).filter(MainUrl.pid == pid).first()
    if mainurl is None:
        # Robustness fix: the original raised (and silently swallowed) an
        # AttributeError when the pid did not exist.
        db_session.close()
        return
    changed = False
    for field in ('remark', 'status', 'rule'):
        if field in parameter:
            setattr(mainurl, field, parameter[field])
            changed = True
    if changed:
        db_session.commit()
    db_session.close()
def get_all_accounts(cls):
    """Return every Account row."""
    account_query = db_session.query(Account)
    return account_query.all()
def get_jobs_by_task_id(cls, task_id):
    """Return the status column of every job under *task_id*."""
    status_query = db_session.query(Job.status)
    return status_query.filter(Job.task == task_id).all()
def add_account_using_counts(cls, account_id):
    """Increment the in-use counter of one account.

    NOTE(review): no commit happens here — presumably the caller flushes
    the session; confirm before relying on persistence.
    """
    account = db_session.query(Account).filter(
        Account.id == account_id).first()
    if account:
        account.using += 1
def get_all_pausing_task(cls):
    """Return every task whose status is 'pausing'."""
    pausing_query = db_session.query(Task).filter(Task.status == 'pausing')
    return pausing_query.all()
def get_all_processor(cls):
    """Return the distinct processor names across all task categories."""
    # The original chained a no-op empty .filter(); dropped — same SQL.
    rows = db_session.query(TaskCategory.processor).distinct().all()
    return [row[0] for row in rows]
def get_aps_ids_by_task_id(cls, task_id):
    """Return the aps_id of *task_id*, or '' when the task is missing."""
    row = db_session.query(Task.aps_id).filter(Task.id == task_id).first()
    return row[0] if row else ''
def get_task_status_apsid(cls, task_id):
    """Return the (status, aps_id) row for *task_id*, or None if absent."""
    pair_query = db_session.query(Task.status, Task.aps_id)
    return pair_query.filter(Task.id == task_id).first()
def get_task_by_task_id(cls, task_id):
    """Fetch a single Task by primary key, or None when it does not exist."""
    task_query = db_session.query(Task).filter(Task.id == task_id)
    return task_query.first()
def get_all_failed_task(cls):
    """Return every Task whose status is 'failed'.

    Bug fix: the original called ``db_session.query()`` with no entity,
    which never yields Task rows; query the Task model like the sibling
    get_all_pausing_task does.
    """
    return db_session.query(Task).filter(Task.status == 'failed').all()
def get_all_tasks(cls):
    """Return every Task row."""
    task_query = db_session.query(Task)
    return task_query.all()
def get_account(cls, account_id):
    """Fetch one Account by id, or None when it does not exist."""
    account_query = db_session.query(Account).filter(
        Account.id == account_id)
    return account_query.first()
def delete_one(cls, parameter):
    """Delete the MainUrl row addressed by ``parameter['pid']``.

    Robustness fix: when the pid does not exist the original passed None
    to ``session.delete()``, which raises; now the delete is skipped.
    """
    maininfo = db_session.query(MainUrl).filter(
        MainUrl.pid == parameter["pid"]).first()
    if maininfo is not None:
        db_session.delete(maininfo)
        db_session.commit()
    db_session.close()
def set_aps_status_by_task(cls, task_id, status):
    """Set *status* on every TaskAccountGroup row bound to *task_id*.

    NOTE(review): no commit happens here — presumably the caller flushes
    the session; confirm before relying on persistence.
    """
    groups = db_session.query(TaskAccountGroup).filter(
        TaskAccountGroup.task_id == task_id).all()
    for group in groups:
        group.status = status
def get_scheduler(cls, scheduler_id):
    """Return (mode, interval, start_date, end_date) for one scheduler,
    or None when *scheduler_id* does not exist."""
    columns = (Scheduler.mode, Scheduler.interval, Scheduler.start_date,
               Scheduler.end_date)
    return db_session.query(*columns).filter(
        Scheduler.id == scheduler_id).first()
def get_all_new_task(cls):
    """Return (id, status) for every task still in the 'new' state."""
    new_query = db_session.query(Task.id, Task.status).filter(
        Task.status == 'new')
    return new_query.all()