def isAlreadyExistsWeb(companyName, currentTime, expired, shortUrl, subTaskId, url, isExistsWebId): subTask = TaskInfo.getOne(TaskInfo.id == subTaskId) taskResult = subTask.taskResultId bigTask = taskResult.taskId bigTaskId = bigTask.taskId # 获取当前网站上一次的更新时间 websiteResult = Website.getOne(Website.webId == isExistsWebId) webUpdateTime = websiteResult.updateDate if webUpdateTime is None: diffDay = expired + 1 else: diffDay = (currentTime - webUpdateTime).days if diffDay > expired: print '时间已经过期' # 网站信息过期,需要重新抓取并检测 fetchWebsite(companyName, bigTaskId, subTaskId, url) else: # 判断是否有当前网站信息是否存在亮照编号 if websiteResult.licID != '': #logger.debug('已经亮照:', shortUrl) q = TaskInfo.update(state='2').where(TaskInfo.id == subTaskId) q.execute() else: #logger.debug('未亮照:', shortUrl) q = TaskInfo.update(state='3').where(TaskInfo.id == subTaskId) q.execute()
def judgeLzResult(com, web, shortUrl, subTask):
    """Evaluate the scraped licence (亮照) result for one sub-task.

    Updates the company/website records, records a redirect ("jump")
    relationship when the queried URL and the licence page's URL differ,
    then sets the sub-task state:
      '2' licence matches both company and domain, '9' matches company
      only, '8' matches domain only, '4' no match at all.

    NOTE(review): the original file's line structure was lost; this
    nesting is the most consistent reading — confirm against history.
    """
    # Refresh company info from the freshly scraped data.
    impCompanyInfo(com)
    if shortUrl != '':
        # Normalize the queried URL for comparison with the scraped domain.
        shortUrl = shortUrl.replace('http://', '').replace(' ', '')
        print 'shortUrl:', shortUrl
        print "web.domain:", web.domain
        com = Company.getOne(Company.coname == com.coname)
        # Queried URL and licence-page URL differ -> redirect relationship.
        if shortUrl != web.domain:
            # If a record for the target domain already exists, reuse it
            # instead of inserting a duplicate.
            existsJumpWeb = Website.getOne((Website.domain == web.domain))
            if existsJumpWeb is None:
                dt = format(datetime.datetime.now(), '%Y-%m-%d %H:%M:%S')
                web.updateDate = dt
                web.regID = com
                web.save(force_insert=True)
                # Point the queried website at the jump target.
                q = Website.update(jump=web).where(Website.webId == subTask.webId.webId)
                q.execute()
            else:
                impWebsite(web)
                # Point the queried website at the existing jump target.
                q = Website.update(jump=existsJumpWeb).where(Website.webId == subTask.webId.webId)
                q.execute()
        else:
            # Same domain: just update the website record in place.
            impWebsite(web)
        # Does a licensed record exist matching both this company and domain?
        count = Website.select().where(
            (Website.licID != '') &
            (Website.regID == com) &
            (Website.domain == shortUrl)
        ).count()
        if count == 0:
            # Partial matches: company-only vs. domain-only.
            onlyCname = Website.select().where(
                (Website.licID != '') & (Website.regID == com)).count()
            onlyDomain = Website.select().where(
                (Website.licID != '') & (Website.domain == shortUrl)).count()
            if onlyCname > 0:
                q = TaskInfo.update(state='9').where(TaskInfo.id == subTask.id)
                q.execute()
            elif onlyDomain > 0:
                q = TaskInfo.update(state='8').where(TaskInfo.id == subTask.id)
                q.execute()
            else:
                q = TaskInfo.update(state='4').where(TaskInfo.id == subTask.id)
                q.execute()
        else:
            # Full match: licence confirmed.
            q = TaskInfo.update(state='2').where(TaskInfo.id == subTask.id)
            q.execute()
def intervalDelayTask(taskResultId): # 获取celery中当前已经正在进行的任务数 nowCount = TaskInfo.select().order_by(TaskInfo.id).where((TaskInfo.state == '6')) if nowCount >= 0: # 当celery中正在做的任务数量少于指定的数量时,向celery添加需要执行的任务 if nowCount <= config.celeryMaxCount: taskCount = TaskInfo.select().where( (TaskInfo.state == '1') & (TaskInfo.taskResultId == taskResultId)).count() print 'taskCount:', taskCount if taskCount == 0: singalCheck(taskResultId) # 查询该任务设置的延迟时间开启下一次需要检查的任务 interval = Task.getOne(Task.taskId == ( TaskResult.select(TaskResult.taskId).where(TaskResult.taskResultId == taskResultId))).intervalDay print 'interval:', interval if interval != "": # 生成需要轮巡的新主任务结果记录 taskResult = TaskResult() lastTaskResult = TaskResult.getOne(TaskResult.taskResultId == taskResultId) taskResult.taskId = lastTaskResult.taskId taskResult.state = '1' taskResult.save() #将上一次的任务结果编号所对应的webId指定给新的任务结果 psql_db.transaction() try: query = (TaskInfo .insert_from( fields=[TaskInfo.webId], query=TaskInfo.select(TaskInfo.webId).where(TaskInfo.taskResultId == lastTaskResult))) query.execute() q = TaskInfo.update(taskResultId=taskResult).where(TaskInfo.taskResultId.is_null()) q.execute() except Exception, e: print e psql_db.rollback() # 获取当前时间 ctime = datetime.datetime.now() delay_time = int(interval) stime = ctime + datetime.timedelta(seconds=delay_time) scheduler.add_job(intervalDelayTask, "date", next_run_time=stime, args=[taskResult.taskResultId], jobstore="default", id=taskResult.taskResultId) else: tasks = TaskInfo.select().order_by(TaskInfo.id).paginate(0, config.sendCeleryCount).where( (TaskInfo.taskResultId == taskResultId) & (TaskInfo.state == '1')) for subTask in tasks: subtaskId = subTask.id fetchCycle.apply_async((subtaskId,), queue="celery") # 更新taskinfo状态为已发送 q = TaskInfo.update(state='6').where(TaskInfo.id == subtaskId) q.execute()
def checkLz(subtaskId):
    """Run the licence (亮照) check for one sub-task.

    If the task's website has never been checked (no update timestamp in
    the Website table) it is fetched and analysed from scratch; otherwise
    isAlreadyExistsWeb decides whether the stored result is still fresh.
    On any error the sub-task is marked failed (state '-1').
    """
    try:
        # Load the sub-task and walk up to the enclosing main task id.
        subTask = TaskInfo.getOne(TaskInfo.id == subtaskId)
        checkTaskResult = subTask.taskResultId
        bigTaskId = checkTaskResult.taskId.taskId
        if subTask is not None:
            companyName = subTask.webId.regID.coname
            url = subTask.webId.domain
            if validateUrl(url):
                # Has this site already been checked? (a Website row with a
                # non-null update timestamp means yes)
                isExistsTask = TaskInfo.getOne(
                    TaskInfo.webId == Website.getOne(
                        (Website.domain == url) & (Website.updateDate.is_null(False))))
                subId = subTask.id
                if isExistsTask is None:
                    # Never checked: fetch and analyse now.
                    fetchWebsite(companyName, bigTaskId, subId, url)
                else:
                    isExistsWebId = isExistsTask.webId.webId
                    # Within the expiry window the stored result is reused.
                    expired = Configs.getOne(Configs.type == 'update').expired
                    currentTime = datetime.datetime.now()
                    isAlreadyExistsWeb(companyName, currentTime, expired, url, subId, url, isExistsWebId)
    except Exception:
        # BUG FIX: the original update had no where() clause and therefore
        # marked EVERY taskinfo row as failed; restrict it to this sub-task.
        q = TaskInfo.update(state='-1').where(TaskInfo.id == subtaskId)
        q.execute()
def noAccess(dirPath, filePath, mainTask, shortUrl, subTask):
    """Mark a sub-task's site as unreachable and clean up its downloads.

    Touches the website's update timestamp, sets the sub-task state to
    '7' (no access), then removes the downloaded home page file and the
    per-task directory if they exist.
    """
    timestamp = format(datetime.datetime.now(), '%Y-%m-%d %H:%M:%S')
    Website.update(updateDate=timestamp).where(
        Website.webId == subTask.webId.webId).execute()
    TaskInfo.update(state='7').where(TaskInfo.id == subTask.id).execute()
    # Remove the home page first, then the (now empty) task directory.
    for path, remover in ((filePath, os.remove), (dirPath, os.rmdir)):
        if os.path.exists(path):
            remover(path)
def executeOnceTaskInfo(taskResultId): # 获取celery中当前已经正在进行的任务数 nowCount = TaskInfo.select().where((TaskInfo.state == '6')).count() # 当celery中正在做的任务数量少于指定的数量时,向celery添加需要执行的任务 if nowCount <= config.celeryMaxCount: taskCount = TaskInfo.select().where( (TaskInfo.state == '1') & (TaskInfo.taskResultId == taskResultId)).count() print 'taskCount:', taskCount if taskCount == 0: singalCheck(taskResultId) else: tasks = TaskInfo.select().order_by(TaskInfo.id).paginate(0, config.sendCeleryCount).where( (TaskInfo.taskResultId == taskResultId) & (TaskInfo.state == '1')) for subTask in tasks: subtaskId = subTask.id checkLz.apply_async((subtaskId,), queue="celery") # 更新taskinfo状态为已发送 q = TaskInfo.update(state='6').where(TaskInfo.id == subtaskId) q.execute()
def fetchLzPage(isLzUrl, lzPath, shortUrl, subTask): # 根据亮照的完整url进行抓取 status = downloadByPath(isLzUrl, lzPath) if not os.path.exists(lzPath): print "lzpath:",lzPath #logger.debug('亮照页面无法访问:', isLzUrl) #dt = format(datetime.datetime.now(), '%Y-%m-%d %H:%M:%S') # qw = Website.update(updateDate=dt).where(Website.webId == subTask.webId.webId) # qw.execute() q = TaskInfo.update(state='5', remark=isLzUrl).where(TaskInfo.id == subTask.id) q.execute() else: try: f = open(lzPath, 'r') parseData = parserCompanyAndWeb(f.read()) com = buildCompany(parseData['company']) tempBuildWeb = buildWebsite(parseData['web']) judgeLzResult(com, tempBuildWeb, shortUrl, subTask) except Exception, e: print e
def judgeNoLz(companyName, shortUrl, subTask):
    """Record a "no licence found" outcome for one sub-task.

    Touches the website's update timestamp and sets the sub-task state
    to '3' (no licence badge).
    """
    now = format(datetime.datetime.now(), '%Y-%m-%d %H:%M:%S')
    Website.update(updateDate=now).where(
        Website.webId == subTask.webId.webId).execute()
    TaskInfo.update(state='3').where(TaskInfo.id == subTask.id).execute()