def _update_to_db(self, task: Task, targetclient: Client, succ: bool):
    """Record the result of dispatching a sub-task and persist its status.

    According to the original author's note, the task arriving here is
    always a sub-task.

    * ``succ`` is False: the task (and its ``IdownCmd``, if it carries one)
      is marked ``Failed`` and feedback objects are written through
      ``OutputManagement.output``.
    * ``succ`` is True: the task (and its ``IdownCmd``) is marked
      ``Dealing``.

    In both cases the resulting sub-task status is then written to the
    database; the cmd status is written only when the sub-task update
    succeeded. Any exception is logged and the task/cmd are marked
    ``Failed`` in the database.

    :param task: the sub-task whose dispatch result is recorded.
    :param targetclient: the client the task was sent to.
    :param succ: whether sending the task succeeded.
    :return: None
    """
    try:
        if succ:
            # Sent successfully: the task is now being processed.
            task.cmdstatus = ECommandStatus.Dealing
            if isinstance(task.cmd, IdownCmd):
                task.cmd.cmdstatus = ECommandStatus.Dealing
            self._logger.info(
                "Send task succeed\t{}:\ntaskid:{}\nbatchid:{}\nclient:{}\t{}"
                .format(task.tasktype.name, task.taskid, task.batchid,
                        task._clientid, targetclient._statusbasic.ip))
        else:
            # Sending failed: emit feedback data for the sub-task.
            task.cmdstatus = ECommandStatus.Failed
            task.cmdrcvmsg = "发送任务到采集端失败"
            batch_back: TaskBatchBack = TaskBatchBack.create_from_task(
                task, cmdrcvmsg=task.cmdrcvmsg)
            if not OutputManagement.output(batch_back):
                self._logger.error(
                    "Write idown_task_back failed:\ntaskid:{}".format(
                        task.taskid))

            if isinstance(task.cmd, IdownCmd):
                # The attached cmd fails together with its task.
                task.cmd.cmdstatus = ECommandStatus.Failed
                task.cmd.cmdrcvmsg = task.cmdrcvmsg
                cmd_back: CmdFeedBack = CmdFeedBack.create_from_cmd(
                    task.cmd, task.cmd.cmdstatus, task.cmd.cmdrcvmsg)
                if not OutputManagement.output(cmd_back):
                    self._logger.error(
                        "Write IDownCmdBack failed:\nplatform:{}\ncmdid:{}"
                        .format(task._platform, task.cmd_id))

        # Persist the status of the current sub-task.
        task_saved = self._dbmanager.update_batchtask_status(
            task._platform, task.taskid, task.batchid, task.cmdstatus)
        if not task_saved:
            # If this update fails the sending thread may resend the same
            # sub-task, so the collection client has to be idempotent
            # (de-duplicate tasks so a task received twice runs once).
            self._logger.error(
                "Update task cmdstatus failed:\ntaskid:{}\ncmdstatus:{}".
                format(task.taskid, task.cmdstatus.name))
            return

        # When the task carries a cmd, persist the cmd status as well.
        if not isinstance(task.cmd, IdownCmd):
            return
        cmd_saved = self._dbmanager.update_cmd_status(
            task.cmd.platform, task.cmd.cmd_id,
            targetclient._statusbasic._clientid, task.cmd.cmdstatus)
        if not cmd_saved:
            self._logger.error(
                "Update cmd cmdstatus failed:\ntaskid:{}\ncmdstatus:{}".
                format(task.taskid, task.cmdstatus.name))
    except Exception:
        self._logger.error("Update task cmdstatus to db error: {}".format(
            traceback.format_exc()))
        self._dbmanager.update_idown_task_status(task, ECommandStatus.Failed)
        if isinstance(task.cmd, IdownCmd):
            self._dbmanager.update_cmd_status(task._platform, task.cmd_id,
                                              task.cmd._clientid,
                                              ECommandStatus.Failed)
def _execute_check_registration(self, spiders: list, tsk: Task):
    """Run the registration check of ``tsk`` on every given spider config.

    A spider is instantiated from each entry of ``spiders`` and its
    ``check_registration()`` is called. A failure of a single spider is
    reported as a ``Failed`` task feedback and logged, then the remaining
    spiders are still processed. Whatever happens, the task is removed
    from ``self._spider_dealing_dict`` and ``tsk.on_complete`` is invoked
    if it is callable.

    :param spiders: iterable of app configs; each exposes ``_appclass``.
    :param tsk: the task being executed.
    :return: None
    """
    try:
        for cfg in spiders:
            try:
                plugin: SpiderBase = cfg._appclass(
                    tsk, self._get_appcfg(cfg), self._clientid)
                plugin.check_registration()
            except Exception:
                # One platform failing must not stop the others.
                self._write_tgback(tsk, ECommandStatus.Failed, "查询账号是否注册失败")
                self._logger.error("Registration check err: {}".format(
                    traceback.format_exc()))
        # The task ends once every platform has been queried.
        self._logger.info("Task {} execute complete.".format(tsk.batchid))
    except Exception:
        self._logger.error(f"Execute task error:\nbatchid:{tsk.batchid}\nerror:{traceback.format_exc()}")
        self._write_tgback(tsk, ECommandStatus.Failed, "执行任务出现不可知错误")
    finally:
        with self._spider_threads_locker:
            if tsk in self._spider_dealing_dict:
                self._spider_dealing_dict.pop(tsk, None)
        if tsk is not None and callable(tsk.on_complete):
            tsk.on_complete(tsk)
def login_and_download_data(self, tsk: Task):
    """Entry point for login-and-download tasks.

    Per the original notes, download tasks are kept in the local SQL store
    so data can be downloaded again/refreshed and login tokens retained
    (updated by swm, 2019-12-19).

    Processing order:

    1. If the task carries a cmd (``tsk.cmd_id`` is set) the cmd is stored
       first and a cmd feedback (``Succeed``/``Failed``) is written. Tasks
       and cmds live in separate tables, so storing the cmd first avoids a
       later query finding a task without its cmd.
    2. The task is routed:

       * SMS / SMS+password token: stored login credentials are
         synchronised into the task, then it is stored as an SMS task.
       * ``webmail`` protocol: handed to ``self._checklogin`` (the
         original note says the password/cookie is validated there before
         the task is saved).
       * anything else (pop3 / imap per the original note): the task is
         inserted into the local store, a ``Dealing`` feedback is written
         and ``tsk.on_complete`` is invoked.

    :param tsk: the task to process.
    :return: None
    """
    dlprotocal = tsk.cmd.stratagymail.eml_priority_protocol
    token = tsk.tokentype

    if tsk.cmd_id is not None:
        try:
            self._sqlfunc.store_task_cmd(tsk.cmd_id, tsk.cmd.cmd_str)
            CmdProcess.write_cmd_back(tsk.cmd, ECommandStatus.Succeed,
                                      "任务设置应用成功")
        except Exception:
            # Deliberately not a bare ``except``: KeyboardInterrupt and
            # SystemExit must not be swallowed and reported as a failure.
            self._logger.error(
                f"Store task cmd error, err:{traceback.format_exc()}")
            CmdProcess.write_cmd_back(tsk.cmd, ECommandStatus.Failed,
                                      "任务设置应用失败")

    if token in (ETokenType.Sms, ETokenType.SmsPwd):
        # Sync the stored login credential before inserting the task.
        self._synchronize_login_cred(tsk)
        self._store_sms_task(tsk)
    elif dlprotocal == "webmail":
        # Only pwd and cookie tokens remain for webmail; older token
        # types are deprecated (original author's note).
        self._checklogin(tsk)
    else:
        # pop3 / imap: store the task so it enters the download queue.
        self._sqlfunc.insert_task_to_sqlit(tsk)
        self._logger.info(
            f"Get a pop3/imap task\ntaskid:{tsk.taskid}\nbatchid:{tsk.batchid}"
        )
        self._write_tgback(tsk, ECommandStatus.Dealing, "pop3/imap下载任务已加入下载队列")
        # The task is finished from this handler's point of view.
        if callable(tsk.on_complete):
            tsk.on_complete(tsk)
def login_batch_test(self, tsk: Task):
    """Placeholder for batch login testing; not implemented yet.

    Planned steps, from the original notes:

    1. parse the account/password file;
    2. test the site with every credential pair;
    3. generate the feedback file.

    For now it only prints a marker and completes the task. The completion
    callback is invoked only when it is callable, matching the other task
    handlers in this file.

    :param tsk: the task to complete.
    :return: None
    """
    print("to do")
    if callable(tsk.on_complete):
        tsk.on_complete(tsk)
def _merge_task_status(self, task: Task):
    """Merge the send status of all sub-tasks into the parent task.

    This module is the task *sending* manager, so it only handles tasks
    waiting to be sent (original author's note). Rules:

    * while any sub-task is still ``WaitForSend``, nothing is changed;
    * if at least one sub-task was sent (``Dealing``), the parent task
      becomes ``Dealing`` and no feedback is written;
    * otherwise every sub-task failed to send: the parent task becomes
      ``Failed`` and a ``TaskBack`` feedback is written.

    Any exception is logged and the parent task is marked ``Failed`` in
    the database.

    :param task: the task whose sub-task statuses are merged.
    :return: None
    """
    try:
        # Leave the task alone while sub-tasks are still waiting to be sent.
        waitforsendcount: int = self._dbmanager.get_batch_task_count_by_cmdstatus(
            task, ECommandStatus.WaitForSend)
        if waitforsendcount > 0:
            return

        # One successfully sent sub-task is enough for the whole task to
        # count as sent / being processed.
        sendsucccount: int = self._dbmanager.get_batch_task_count_by_cmdstatus(
            task, ECommandStatus.Dealing)
        if sendsucccount > 0:
            task.cmdstatus = ECommandStatus.Dealing
            self._logger.info(
                "Task all sent, taskid={} tasktype={}".format(
                    task.taskid, task.tasktype))
            if not self._dbmanager.update_idown_task_status(
                    task, ECommandStatus.Dealing):
                self._logger.error(
                    "Update task cmdstatus to {} faled: taskid:{}".format(
                        ECommandStatus.Dealing.name, task.taskid))
            return

        # Every sub-task was sent and none succeeded: the whole task
        # failed, so update it and write feedback data.
        task.cmdstatus = ECommandStatus.Failed
        self._logger.error(
            "Task all sent failed, taskid={} tasktype={}".format(
                task.taskid, task.tasktype))
        if not self._dbmanager.update_idown_task_status(
                task, ECommandStatus.Failed):
            self._logger.error(
                "Update task cmdstatus to {} faled: taskid:{}".format(
                    ECommandStatus.Failed.name, task.taskid))
        tb: TaskBack = TaskBack(task, ECommandStatus.Failed,
                                "任务执行失败,发送到采集端失败")
        if not OutputManagement.output(tb):
            self._logger.error(
                "Write idown_task_back failed:\ntaskid:{}".format(
                    task.taskid))
    except Exception as ex:
        # The attribute is ``_platform``; the previous ``_platformm`` typo
        # raised AttributeError here, which hid the real error and skipped
        # the fallback status update below.
        self._logger.error(
            "Merge IDownTask status error:\nplatform:{}\ntaskid:{}\nerror:{}"
            .format(task._platform, task.taskid, ex.args))
        self._dbmanager.update_idown_task_status(task,
                                                 ECommandStatus.Failed)
def store_input(self, tsk: Task):
    """Store the verification code carried by ``tsk`` and report the result.

    The input is inserted into the local store first; a ``Succeed``
    feedback is written only after the insert worked. If anything raises,
    the error is logged and a ``Failed`` feedback is written. Previously
    the success feedback was written before the insert and the error path
    also reported ``Succeed``. ``tsk.on_complete`` is always invoked when
    it is callable.

    :param tsk: the input task holding the verification code.
    :return: None
    """
    try:
        self._sqlfunc.input_insert(tsk)
        self._write_tgback(tsk, ECommandStatus.Succeed, "任务处理成功,验证码已提取")
    except Exception as err:
        self._logger.error("Store input error:{}".format(err))
        self._write_tgback(tsk, ECommandStatus.Failed, "执行任务出现不可知错误")
    finally:
        if callable(tsk.on_complete):
            tsk.on_complete(tsk)
def _synchronize_login_cred(self, tsk: Task):
    """Copy a previously stored cookie into ``tsk`` when it has none.

    Currently used only for SMS-login credentials (original note; other
    credential types may be added later). Stored tasks with the same
    ``apptype`` and ``phone`` and a ``taskstatus`` of 6 or 3 are looked
    up; the original 2019-03-21 note describes those statuses as
    "download succeeded or login succeeded". The cookie of the last
    returned row is used.

    :param tsk: the task to enrich; ``tsk.cookie`` may be assigned.
    :return: None
    """
    sql = """ select * from task where apptype=? and phone=? and (taskstatus=6 or taskstatus=3) """
    rows = self._sqlfunc.query_task_by_sql(sql, (tsk.apptype, tsk.phone))
    if len(rows) == 0:
        return

    # Take the last row (the original note calls it the newest one).
    stored_cookie = rows[-1].get("cookie")
    # Only fill in a cookie; never overwrite one the task already has.
    if stored_cookie is not None and tsk.cookie is None:
        tsk.cookie = stored_cookie
def _distribute_tasktype(self, tsk: Task):
    """Hand ``tsk`` to the handler that matches its task type.

    Before routing, the task's cmd is completed with the defaults: a task
    without ``cmd_id`` gets ``self.defcmd`` as its cmd, otherwise the
    existing cmd is filled from ``self.defcmd``. This is done locally
    because fields sent by the front end cannot be relied on (original
    note).

    :param tsk: the task to route.
    :return: None
    :raises Exception: when the task type matches no known handler.
    """
    # Load/complete the cmd before anything is stored.
    if tsk.cmd_id is not None:
        tsk.cmd.fill_defcmd(self.defcmd)
    else:
        tsk.cmd = self.defcmd
    self._logger.info(f"Task start to processing, apptype={tsk.apptype}")

    kind = tsk.tasktype
    if kind == ETaskType.LoginOnly:
        self._login_only.login_only(tsk)
        return
    if kind == ETaskType.LoginDownload:
        # Login-and-download data goes into the database and follows the
        # download strategy; everything currently takes this route, so
        # two-step verification has to be added here (original note).
        self._download_task_store.login_and_download_data(tsk)
        return
    if kind == ETaskType.CheckOnline:
        self._online_check.online_check(tsk)
        return
    if kind == ETaskType.CheckRegistration:
        self._register_check.check_registration(tsk)
        return
    if kind == ETaskType.LoginTest:
        self._batch_login_test.login_batch_test(tsk)
        return
    if kind == ETaskType.Input:
        # Verification codes go straight into the database; a code is
        # waited for up to 15 minutes (original note).
        self._store_vercode.store_input(tsk)
        return
    if kind == ETaskType.Logout:
        self._logout.logout(tsk)
        return
    raise Exception("Unknown tasktype!")
def _store_sms_task(self, tsk: Task):
    """Store an SMS download task in the local database.

    A task without a phone number is rejected with a ``Failed`` feedback.
    Otherwise the task is inserted first and the ``Dealing`` ("queued")
    feedback is written only after the insert succeeded, in the same order
    as the pop3/imap branch of ``login_and_download_data``. If storing
    raises, the error is logged and a ``Failed`` feedback is written on a
    best-effort basis, so the failure is reported rather than only logged.
    ``tsk.on_complete`` is always invoked when it is callable.

    :param tsk: the SMS download task.
    :return: None
    """
    try:
        if tsk.phone is None or tsk.phone == "":
            self._logger.error("Sms donwload phone cannot be none!")
            self._write_tgback(tsk, ECommandStatus.Failed, "phone不能为空")
            return
        self._logger.info("New sms task: {}".format(tsk.batchid))
        self._sqlfunc.insert_task_to_sqlit(tsk)
        self._write_tgback(tsk, ECommandStatus.Dealing, "已将短信下载任务加入处理队列")
    except Exception as err:
        self._logger.error("Store sms error: {}".format(err))
        try:
            self._write_tgback(tsk, ECommandStatus.Failed, "执行任务出现不可知错误")
        except Exception as back_err:
            # Reporting is best effort; this method never propagated
            # errors to its caller and still does not.
            self._logger.error(
                "Write sms task feedback error: {}".format(back_err))
    finally:
        if callable(tsk.on_complete):
            tsk.on_complete(tsk)
def _convert(self, data: InputData) -> iter:
    """Yield a ``Task`` for every valid segment parsed from ``data``.

    Converts data issued by the center into the common ``Task`` structure;
    one file may contain several task segments. For each non-empty
    segment the required fields are added, ``apptype`` is resolved when
    missing, and the fields are validated before a ``Task`` is built.

    If the stream is missing/unreadable, a segment fails validation, or
    any exception is raised, ``data.on_complete(False)`` is called once
    when the generator finishes.

    :param data: the input data whose ``stream`` is parsed.
    :return: a generator of ``Task`` objects.
    """
    all_ok = True
    try:
        if data.stream is None or not data.stream.readable():
            self._logger.error(
                "Data stream is None when trying to convert to standard Task: %s"
                % data._source)
            all_ok = False
            return

        for seg in DataParser.parse_standard_data(data.stream):
            if seg is None or len(seg._fields) < 1:
                continue
            try:
                # Mandatory fields.
                self._add_required_fields(seg, data)

                # Resolve the apptype (from the host, per the original
                # note) when the segment does not provide one.
                if not seg.contains_key("apptype"):
                    apptype = self._get_apptype(seg._fields, data)
                    if apptype is not None:
                        seg.append_to_fields('apptype', apptype)

                # Field validation.
                if not self._validation_fields(seg, data):
                    all_ok = False
                    continue

                task: Task = Task(seg._fields)
                task.segindex = seg.segindex
                task.segline = seg.segline
                yield task
            except Exception:
                all_ok = False
                self._logger.error(
                    "Generate Task from dic fields error:\ndata:%s\nex:%s" %
                    (data._source, traceback.format_exc()))
    except Exception:
        all_ok = False
        self._logger.error("Convert data to Task error:\ndata:%s\nex:%s" %
                           (data._source, traceback.format_exc()))
    finally:
        if not all_ok and data is not None:
            data.on_complete(False)
def manage_task(self, task: Task):
    """Process one task taken from the queue.

    The task is registered in ``self._spider_manage_queue_dict`` under its
    ``batchid``; a task whose batch id is already registered is rejected
    with an error log and not processed again. Otherwise the task is
    dispatched by type. Ordinary errors raised while dispatching are
    logged; ``KeyboardInterrupt``/``SystemExit`` are no longer swallowed.
    In every case the completion callback is invoked afterwards, because
    a task that ends prematurely would otherwise stay in the processing
    queue (original note).

    :param task: the task to process.
    :return: None
    """
    with self._spider_manage_dealing_queue_locker:
        if task.batchid in self._spider_manage_queue_dict:
            self._logger.error("Task is on dealing: {}".format(task.batchid))
            return
        self._spider_manage_queue_dict[task.batchid] = task

    task.on_complete = self.on_complete
    try:
        self._distribute_tasktype(task)
    except Exception:
        self._logger.error(f"Manage idown task error\nerr:{traceback.format_exc()}")
    finally:
        if callable(task.on_complete):
            task.on_complete(task)
def _convert(self, data: InputData) -> iter:
    """Yield a ``Task`` for every usable segment of ``data``.

    Converts data issued by the center into the common ``Task`` structure;
    one file may contain several task segments. Segments whose real token
    type is ``ETokenType.Unknown`` are logged and skipped. When building a
    task raises, or the conversion as a whole raises, the error is logged
    and ``data.on_complete(False)`` is called. A missing or unreadable
    stream is only logged.

    :param data: the input data to convert.
    :return: a generator of ``Task`` objects.
    """
    try:
        if data.stream is None or not data.stream.readable():
            self._logger.error(
                "Data stream is None when trying to convert to standard Task: %s"
                % data._source)
            return

        for segment in self._get_segments(data):
            if segment is None or len(segment._fields) < 1:
                continue
            try:
                # Build the task object.
                task: Task = Task(segment._fields)
                task.segindex = segment.segindex
                task.segline = segment.segline
                if not isinstance(task, Task):
                    self._logger.error("Parse task failed.")
                    continue

                # Check that the token type is a recognised one.
                real_token: ETokenType = task.get_real_tokentype()
                if real_token == ETokenType.Unknown:
                    self._logger.error(
                        "Invalid token in task: {} {}".format(
                            task.taskid, task.tasktype))
                    continue

                yield task
            except Exception:
                self._logger.error(
                    "Generate Task from dic fields error:\ndata:%s\nex:%s" %
                    (data._source, traceback.format_exc()))
                if data is not None:
                    data.on_complete(False)
    except Exception:
        self._logger.error("Convert data to Task error:\ndata:%s\nex:%s" %
                           (data._source, traceback.format_exc()))
        if data is not None:
            data.on_complete(False)
def _construct_task(self, filedata: dict) -> Task:
    """Build a ``Task`` from a stored record.

    Construction is kept separate from queueing on purpose: the original
    note criticises an earlier design that mixed building a Task with
    ``put_task_to_queue``.

    :param filedata: the stored task record; must contain the keys
        ``"otherfileds"`` (a JSON string) and ``"sequence"``.
    :return: the initialised task, with its cmd completed from
        ``self.d_cmd`` and ``on_complete`` set to ``self.on_task_complete``.
    """
    task: Task = Task(filedata)

    # Load the settings, filling any gaps from the default cmd.
    if task.cmd_id is not None:
        task.cmd.fill_defcmd(self.d_cmd)
    else:
        task.cmd = self.d_cmd

    # Initialise the task data. The original author questions whether
    # ``_sequence`` has to be assigned here, since Task already sets it.
    raw_other_fields = filedata["otherfileds"]
    task._other_fields = json.loads(raw_other_fields)
    task._sequence = filedata["sequence"]

    # Priority comes from the configuration; it is only used when the
    # task joins the download queue.
    task.priority = task.cmd.stratagy.priority
    task.on_complete = self.on_task_complete
    return task
def _get_logout_data_info(self, tsk: Task):
    """Load the cookie of the parent task into ``tsk`` for a logout.

    The stored task identified by ``tsk.parenttaskid`` and
    ``tsk.parentbatchid`` is queried; when its first row has a cookie,
    that cookie is copied onto ``tsk``.

    :param tsk: the logout task; ``tsk.cookie`` is assigned on success.
    :return: True when a cookie was found and assigned, otherwise False.
    """
    sql = ''' select * from task where taskid=? and batchid=? '''
    rows = self._sqlfunc.query_task_by_sql(
        sql, (tsk.parenttaskid, tsk.parentbatchid))
    if len(rows) == 0:
        return False

    cookie = rows[0].get('cookie')
    if cookie is None:
        return False

    self._logger.info(f"Apptype:{tsk.apptype} will logout out")
    tsk.cookie = cookie
    return True
def _taskparse(self, data: dict, file_suffix: str):
    """Build the task object that corresponds to a file suffix.

    The suffix decides the type (idown task, idown cmd, iscan task,
    iscout task, automated task); the original notes say this replaced an
    older heuristic based on ``taskid``/``cmdid`` because it is more
    accurate (added by judy, 2019-06-11). ``data['clientid']`` is set from
    ``basic_client_config`` first so every parsed object carries the
    client id.

    :param data: the parsed fields of the file; mutated (``clientid``).
    :param file_suffix: the suffix that identifies the file type.
    :return: the constructed object, or None when the suffix is unknown
        (an error is logged in that case).
    :raises Exception: when ``file_suffix`` is None.
    """
    if file_suffix is None:
        raise Exception(
            "To distinguish file types, file_suffix can not be None.")

    # Add the client id up front so tasks and cmds alike carry it.
    data['clientid'] = basic_client_config.clientid

    builders = (
        ('idown_task', Task),
        ('idown_cmd', IdownCmd.parse_from_dict),
        ('iscan_task', IscanTask),
        ('iscout_task', IscoutTask),
        ('automated_task', AutomatedTask.create_from_dict),
    )
    for suffix, build in builders:
        if file_suffix == suffix:
            return build(data)

    # Reaching this point means the file type could not be recognised.
    self._logger.error('Unkown task type')
def _get_cookie_keep_tasks(self):
    """Keep feeding cookie keep-alive tasks from the database to the queue.

    Runs forever. Each round selects the tasks that need cookie
    keep-alive; when there are none it sleeps 60 seconds before scanning
    again. Every selected record accepted by ``_judge_task_to_queue`` is
    turned into a ``Task`` (with its cmd completed from ``self.d_cmd``),
    registered in ``self._dealing_queue`` and put on
    ``self._cookie_queue``. A task whose ``batchid`` is already registered
    is skipped.

    The membership check and the registration happen under the same lock,
    so the same batch cannot be registered twice between the check and
    the insert.

    :return: never returns.
    """
    while True:
        all_tasks = self._selcet_cookie_keep_tasks()
        if not all_tasks:
            # Nothing to do: scan the database again in 60 seconds.
            time.sleep(60)
            continue
        try:
            for task_dict in all_tasks:
                if not self._judge_task_to_queue(task_dict):
                    continue

                # Settings must be loaded whenever a task is instantiated.
                tsk: Task = Task(task_dict)
                if tsk.cmd_id is None:
                    tsk.cmd = self.d_cmd
                else:
                    tsk.cmd.fill_defcmd(self.d_cmd)

                # Check-and-register atomically.
                with self._dealing_queue_locker:
                    if tsk.batchid in self._dealing_queue:
                        continue
                    self._dealing_queue[tsk.batchid] = tsk

                self._cookie_queue.put(tsk)
                self._logger.info(
                    f"Get a cookie keep task\ntaskid:{tsk.taskid}\nbatchid:{tsk.batchid}"
                )
        except Exception:
            self._logger.error(
                f"Make the task from sqlite wrong, err:{traceback.format_exc()}"
            )
        finally:
            time.sleep(1)
def _generate_task_back_percent(self, task: Task) -> bool:
    """Report the overall progress of a batch task when it advanced enough.

    The current percentage is ``floor(batchcompletecount /
    batchtotalcount * 100)``. If the task status is none of ``Dealing``,
    ``NeedSmsCode``, ``Progress`` or ``WaitForSend``, ``task.progress`` is
    first set to 1. A progress feedback is produced only when the
    percentage advanced by at least a step that depends on the number of
    sub-tasks:

    * more than 100 sub-tasks: every 1%;
    * 51 to 100 sub-tasks: every 4% (about 25 feedback files in total);
    * 50 or fewer: roughly every 10%.

    When a feedback is due, the new progress is stored in the database
    and a ``TaskBack`` with status ``Progress`` is written.

    :param task: the parent task whose progress is evaluated.
    :return: True when no feedback was needed or the feedback was written
        successfully; False when the database update, the output, or
        anything else failed.
    """
    try:
        currpercent = math.floor(task.batchcompletecount /
                                 task.batchtotalcount * 100)

        status = task.cmdstatus
        in_flight = (
            ECommandStatus.Dealing,
            ECommandStatus.NeedSmsCode,
            ECommandStatus.Progress,
            ECommandStatus.WaitForSend,
        )
        if all(status != state for state in in_flight):
            task.progress = 1
        lastprogress = task.progress * 100

        # Pick the minimum advance (in percent) that warrants a feedback.
        total = task.batchtotalcount
        if total > 100:
            step = 1
        elif 50 < total <= 100:
            step = 4
        else:
            step = 10
        if currpercent - lastprogress < step:
            return True

        # Store the latest overall progress and send the feedback data.
        task.progress = currpercent / 100
        task.cmdstatus = ECommandStatus.Progress
        if not self._dbmanager.update_idown_task2(task):
            self._logger.error(
                "Update Task with progress failed: taskid={} tasktype={} progress={}"
                .format(task.taskid, task.tasktype.name, task.progress))
            return False

        taskback: TaskBack = TaskBack(
            task,
            ECommandStatus.Progress,
            cmdrcvmsg='{}%'.format(currpercent),
            batchcompletecount=task.batchcompletecount)
        if not OutputManagement.output(taskback):
            self._logger.error(
                "Output TaskBack progress failed:\ntaskid:{}\ntasktype:{}\ncmdstatus:{}\nprogress:{}"
                .format(task.taskid, task.tasktype, task._cmdstatus.name,
                        currpercent))
            return False

        self._logger.info(
            "TaskBack generated, cmdstatus=[Progress {}%]:\ntaskid={}\ntasktype:{}"
            .format(task.progress * 100, task.taskid, task.tasktype.name))
        return True
    except Exception:
        self._logger.error(
            "Generate TaskBack for BatchTask complete percent error:\ntaskid:{}\nerror:{}"
            .format(task.taskid, traceback.format_exc()))
        return False
def _dispatch_by_token(self, task: Task,
                       clients: dict) -> Tuple[bool, Client, str]:
    """Find the client that already holds this task's token resource.

    Business rule (original notes): the same token resource is always
    assigned to one fixed collection client, so searching by token yields
    a single result. The matching earlier task is linked to ``task``
    through ``parenttaskid``/``parentbatchid``. Tasks arriving here are
    expected to be of type ``ETaskType.Logout``.

    :param task: the task to dispatch; its parent ids are set on success.
    :param clients: not used by this method.
    :return: ``(succ, client, msg)``; ``client`` is the located client on
        success, otherwise None. ``msg`` is set for an invalid token type
        or an unexpected error and stays None in the other cases.
    """
    succ: bool = False
    res: Client = None
    msg: str = None
    try:
        # Only these token types can be searched for.
        searchable = (ETokenType.Sms, ETokenType.Pwd, ETokenType.SmsPwd,
                      ETokenType.Cookie)
        if not isinstance(task.tokentype,
                          ETokenType) or task.tokentype not in searchable:
            self._logger.error(
                "Invalid tokentype for dispatching by token search:\ntaskid:{}\nbatchid:{}\ntokentype:{}"
                .format(task.taskid, task.batchid, task.tokentype))
            msg = '无效的令牌资源类型'
            return (succ, res, msg)

        # Search for the earlier task by token resource.
        existtask: Task = DbManager.get_task_by_search_token(
            platform=task._platform,
            apptype=task.apptype,
            tokentype=task.tokentype,
            input_=task.input,
            preglobaltelcode=task.preglobaltelcode,
            preaccount=task.preaccount,
            globaltelcode=task.globaltelcode,
            phone=task.phone,
            account=task.account,
            password=task.password,
            url=task.url,
            host=task.host,
            cookie=task.cookie,
        )
        if not isinstance(existtask, Task):
            self._logger.error(
                "Get client by search clientid from token searching failed:\ntaskid:{}\nbatchid:{}\nclientid:{}"
                .format(task.taskid, task.batchid, task._clientid))
            return (succ, res, msg)

        existclient: Client = DbManager.get_client_status(
            existtask._clientid)
        if not isinstance(existclient, Client):
            # Log the client id that was actually looked up (the one on
            # the earlier task), not the id on the task being dispatched.
            self._logger.error(
                "Got client by search clientid from token searching is invalid:\ntaskid:{}\nbatchid:{}\nclientid:{}"
                .format(task.taskid, task.batchid, existtask._clientid))
            return (succ, res, msg)

        # Link the earlier task to the current one.
        task.parenttaskid = existtask.taskid
        task.parentbatchid = existtask.batchid

        res = existclient
        succ = True
    except Exception:
        self._logger.error(
            "Dispatch by token error:\ntaskid:{}\nerror:{}".format(
                task.taskid, traceback.format_exc()))
        msg = '根据令牌资源分配任务出错'
    return (succ, res, msg)