Exemplo n.º 1
0
def checkLz(subtaskId):
    """Run the license check for a single TaskInfo row.

    The sub-task's website domain is validated with ``validateUrl``. If no
    TaskInfo row references a Website with this domain and a non-null
    ``updateDate``, the site is fetched with ``fetchWebsite``. Otherwise
    ``isAlreadyExistsWeb`` decides whether the stored result can be reused.

    Nothing happens when the sub-task does not exist or its domain fails
    validation. Any exception raised while processing marks this sub-task,
    and only this sub-task, with state '-1'.

    :param subtaskId: primary key of the TaskInfo row to check.
    """
    try:
        subTask = TaskInfo.getOne(TaskInfo.id == subtaskId)
        # Check for a missing record before touching any of its attributes.
        if subTask is None:
            return
        bigTaskId = subTask.taskResultId.taskId.taskId
        companyName = subTask.webId.regID.coname
        url = subTask.webId.domain
        if not validateUrl(url):
            return

        # A Website row with a non-null updateDate is used as the marker
        # that this domain has been checked before.
        checkedSite = Website.getOne(
                (Website.domain == url) & (Website.updateDate.is_null(False)))
        isExistsTask = TaskInfo.getOne(TaskInfo.webId == checkedSite)
        subId = subTask.id
        if isExistsTask is None:
            # No earlier task for this website: fetch and check it now.
            fetchWebsite(companyName, bigTaskId, subId, url)
            return

        isExistsWebId = isExistsTask.webId.webId
        # Expiry threshold read from the 'update' configuration row.
        expired = Configs.getOne(Configs.type == 'update').expired
        currentTime = datetime.datetime.now()
        isAlreadyExistsWeb(companyName, currentTime, expired, url, subId,
                           url,
                           isExistsWebId)
    except Exception:
        # Restrict the failure marker to this sub-task. Without the where
        # clause the update would overwrite the state of every TaskInfo row.
        q = TaskInfo.update(state='-1').where(TaskInfo.id == subtaskId)
        q.execute()
Exemplo n.º 2
0
def isAlreadyExistsWeb(companyName, currentTime, expired, shortUrl, subTaskId, url, isExistsWebId):
    subTask = TaskInfo.getOne(TaskInfo.id == subTaskId)
    taskResult = subTask.taskResultId
    bigTask = taskResult.taskId
    bigTaskId = bigTask.taskId
    # 获取当前网站上一次的更新时间
    websiteResult = Website.getOne(Website.webId == isExistsWebId)
    webUpdateTime = websiteResult.updateDate
    if webUpdateTime is None:
        diffDay = expired + 1
    else:
        diffDay = (currentTime - webUpdateTime).days
    if diffDay > expired:
        print '时间已经过期'
        # 网站信息过期,需要重新抓取并检测
        fetchWebsite(companyName, bigTaskId, subTaskId, url)
    else:
        # 判断是否有当前网站信息是否存在亮照编号
        if websiteResult.licID != '':
            #logger.debug('已经亮照:', shortUrl)
            q = TaskInfo.update(state='2').where(TaskInfo.id == subTaskId)
            q.execute()
        else:
            #logger.debug('未亮照:', shortUrl)
            q = TaskInfo.update(state='3').where(TaskInfo.id == subTaskId)
            q.execute()
Exemplo n.º 3
0
def judgeLzResult(com, web, shortUrl, subTask):
    # 更新网站信息和公司信息
    impCompanyInfo(com)
    # impWebsite(web)
    # nWeb = Website.getOne(Website.domain == web.domain)

    # judgeWeb = Website.getOne(Website.domain ** str.format("%{}%", shortUrl))

    # 如果查询网址与抓取亮照后的网址不匹配
    if shortUrl != '':
        shortUrl = shortUrl.replace('http://', '').replace(' ', '')
    print  'shortUrl:', shortUrl
    print "web.domain:", web.domain
    com = Company.getOne(Company.coname == com.coname)
    if shortUrl != web.domain:
        # 当已经存在跳转关系记录时,不再操作
        existsJumpWeb = Website.getOne((Website.domain == web.domain))
        if existsJumpWeb is None:
            dt = format(datetime.datetime.now(), '%Y-%m-%d %H:%M:%S')
            web.updateDate = dt
            web.regID = com
            web.save(force_insert=True)
            # 更新网站跳转地址
            q = Website.update(jump=web).where(Website.webId == subTask.webId.webId)
            q.execute()
        else:
            impWebsite(web)
            # 更新网站跳转地址
            q = Website.update(jump=existsJumpWeb).where(Website.webId == subTask.webId.webId)
            q.execute()
    else:
        #更新网站信息
        impWebsite(web)

    count = Website.select().where(
            (Website.licID != '') & (Website.regID == com) & (Website.domain == shortUrl)
    ).count()
    if count == 0:
        onlyCname = Website.select().where(
                (Website.licID != '') & (Website.regID == com)).count()
        onlyDomain = Website.select().where(
                (Website.licID != '') & (Website.domain == shortUrl)).count()
        if onlyCname > 0:
            q = TaskInfo.update(state='9').where(TaskInfo.id == subTask.id)
            q.execute()
        elif onlyDomain > 0:
            q = TaskInfo.update(state='8').where(TaskInfo.id == subTask.id)
            q.execute()
        else:
            q = TaskInfo.update(state='4').where(TaskInfo.id == subTask.id)
            q.execute()
    else:
        q = TaskInfo.update(state='2').where(TaskInfo.id == subTask.id)
        q.execute()
Exemplo n.º 4
0
def singalCheck(taskResultId):
    # 根据任务编号查询所有任务是否已经完成
    executeCount = TaskInfo.select().where(
            (TaskInfo.taskResultId == taskResultId) & (TaskInfo.state != 1) & (TaskInfo.state != 6)).count()
    subTaskResult = TaskInfo.select().where(TaskInfo.taskResultId == taskResultId).count()
    print 'subTaskResult', subTaskResult
    print 'executeCount', executeCount
    if executeCount == subTaskResult and executeCount != 0:
        #将本次任务检查结果明细生成到result目录
        checkTaskResult = TaskResult.getOne(TaskResult.taskResultId == taskResultId)
        taskId = checkTaskResult.taskId.taskId
        result_taskresultid = checkTaskResult.taskResultId
        taskCount = TaskInfo.select().where(
                (TaskInfo.taskResultId == taskResultId)).count()
        onceCount = config.getTaskResultCount
        packCount = int(math.ceil(float(taskCount)/float(onceCount)))
        #生成检查结果明细文件
        print '开始生成检查结果明细文件'
        print 'packCount:',packCount
        genTaskResultFile(taskId,result_taskresultid,packCount)


        print '单次任务检查完毕结束当前任务,开始发送邮件通知'
        # 说明该任务结果已经发送完毕,从apscheduler任务调度中删除该任务
        scheduler.remove_job(str(taskResultId))

        bigTaskId = checkTaskResult.taskId.taskId
        taskResultId = checkTaskResult.taskResultId
        # 更新上一次子任务的状态
        oTime = format(datetime.datetime.now(), '%Y-%m-%d %H:%M:%S')
        q = TaskResult.update(state="2", overTime=oTime).where(
                TaskResult.taskResultId == taskResultId)
        q.execute()
        #修改任务结果对应的大任务状态为已完成
        q = Task.update(state="2").where(Task.taskId==taskId)
        q.execute()
        # 任务执行完毕后发送邮件通知
        mutil = MailUtil()
        # 获取当前任务绑定的邮箱账号
        toEmail = TaskResult.getOne(TaskResult.taskResultId == taskResultId).taskId.userId.email
        if toEmail is not None:
            from_addr = config.SEND_EMAIL
            password = config.SEND_EMAIL_PASSWORD
            to_addr = toEmail
            smtp_server = config.SMTP_SERVER
            msg = str.format(config.MAIL_NOTICE, bigTaskId, bigTaskId, str(taskResultId))
            subject = config.MAIL_SUBJECT
            mutil.sendMail.delay(from_addr, password, to_addr, smtp_server, msg, subject)
        else:
            print "该任务未绑定接收邮箱,任务结果编号:",taskResultId
Exemplo n.º 5
0
def intervalFetch(taskResultId, lastTaskResultId, delayTag):
    """Copy the sub-tasks of a previous run into a new run and queue them.

    For each TaskInfo row of ``lastTaskResultId`` a new row is inserted with
    the same ``cname`` and ``url``, an empty state and ``taskResultId`` as
    its run. A ``fetchCycle`` job is then queued on the "celery" queue.

    :param taskResultId: run that receives the new sub-tasks.
    :param lastTaskResultId: run whose sub-tasks are copied.
    :param delayTag: flag forwarded unchanged to ``fetchCycle``.
    """
    previousRows = TaskInfo.select().where(TaskInfo.taskResultId == lastTaskResultId)
    for previous in previousRows:
        clone = TaskInfo()
        clone.cname = previous.cname
        clone.url = previous.url
        clone.state = ''
        clone.taskResultId = taskResultId
        clone.save(force_insert=True)

        jobArgs = (clone.id, taskResultId, delayTag)
        fetchCycle.apply_async(jobArgs, queue="celery")
Exemplo n.º 6
0
def executeOnceTaskInfo(taskResultId):
    # 获取celery中当前已经正在进行的任务数
    nowCount = TaskInfo.select().where((TaskInfo.state == '6')).count()
    # 当celery中正在做的任务数量少于指定的数量时,向celery添加需要执行的任务
    if nowCount <= config.celeryMaxCount:
        taskCount = TaskInfo.select().where(
                (TaskInfo.state == '1') & (TaskInfo.taskResultId == taskResultId)).count()
        print 'taskCount:', taskCount
        if taskCount == 0:
            singalCheck(taskResultId)
        else:
            tasks = TaskInfo.select().order_by(TaskInfo.id).paginate(0, config.sendCeleryCount).where(
                    (TaskInfo.taskResultId == taskResultId) & (TaskInfo.state == '1'))
            for subTask in tasks:
                subtaskId = subTask.id
                checkLz.apply_async((subtaskId,), queue="celery")
                # 更新taskinfo状态为已发送
                q = TaskInfo.update(state='6').where(TaskInfo.id == subtaskId)
                q.execute()
Exemplo n.º 7
0
def fetchCycle(subtaskId, taskResultId, delayTag):
    if delayTag:
        checkTaskResult = TaskResult.getOne(TaskResult.taskResultId == taskResultId)
        bigTaskId = checkTaskResult.taskId.taskId
        # 根据任务结果编号获取本次需要检查的任务记录
        subTask = TaskInfo.getOne(TaskInfo.id == subtaskId)
        if subTask is not None:
            companyName = subTask.cname
            url = subTask.url
            # 抓取检测
            fetchWebsite(companyName, bigTaskId, subTask.id, url)
        else:
            print 'taskinfo记录为空:', subTask
Exemplo n.º 8
0
def noAccess(dirPath, filePath, mainTask, shortUrl, subTask):
    """Record a website as inaccessible and clean up its download location.

    Sets ``updateDate`` of the sub-task's Website row to the current time,
    sets the TaskInfo state to '7', then removes the downloaded page file
    and the task directory if they exist.

    :param dirPath: directory created for this fetch (removed via ``os.rmdir``).
    :param filePath: downloaded home page file (removed when present).
    :param mainTask: not used by this function.
    :param shortUrl: not used by this function.
    :param subTask: TaskInfo row being processed.
    """
    stamp = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    Website.update(updateDate=stamp).where(
            Website.webId == subTask.webId.webId).execute()
    TaskInfo.update(state='7').where(TaskInfo.id == subTask.id).execute()

    # Remove the home page file first, then the directory that held it.
    if os.path.exists(filePath):
        os.remove(filePath)
    if os.path.exists(dirPath):
        os.rmdir(dirPath)
Exemplo n.º 9
0
def checkAllLz(filePath, taskResultId):

    taskResult = TaskResult.getOne(TaskResult.taskResultId == taskResultId)
    delayDay = taskResult.taskId.intervalDay
    #根据当前任务编号目录获取该目录下的所有xml文件
    print '目录路径:',filePath
    fnames=os.listdir(filePath)
    for name in fnames:
        xmlName = filePath+'/'+name
        print 'xmlName:',xmlName
        data = parseXMLFile(xmlName, 'CheckItem')
        print 'data:',data
        # 将需要检查的信息入库
        for d in data:
            cname = d.get('cname')
            url = d.get('url')
            if url != '':
                if url[-1] == '/':
                    url = url.replace('http://', '')[0:-1].replace(' ','')
            if cname!='':
                cname = cname.replace(' ','')
            webArea = d.get('area')
            webtype = d.get('WebType')
            # 检查更新company
            if Company.getOne(Company.coname == cname) is None:
                c = Company()
                c.coname = cname
                c.save(force_insert=True)
            # 检查更新website
            if Website.getOne(Website.domain == url) is not None:
                q = Website.update(domain=url, type=webtype, area=webArea).where(Website.domain == url)
                q.execute()
            else:
                com = Company.getOne(Company.coname == cname)
                w = Website()
                w.regID = com
                w.domain = url
                w.area = webArea
                w.type = webtype
                w.save(force_insert=True)
            updateWeb = Website.getOne(Website.domain == url)
            subTask = TaskInfo()
            subTask.taskResultId = taskResult
            subTask.webId = updateWeb
            subTask.state = '1'
            subTask.save(force_insert=True)

    taskResultId = str(taskResultId)
    if delayDay > 0:
        # 需要周期执行的任务
        executeMultiTaskInfo(taskResultId)
    else:
        #logger.debug("开始调用单次任务")
        # 单次执行的任务
        executeSingleTaskInfo(taskResultId)
Exemplo n.º 10
0
def genTaskResultFile(taskId,taskResultId,packCount):
    onceCount = config.getTaskResultCount
    for i in range(packCount):
        taskinfos = TaskInfo.select().order_by(TaskInfo.id).paginate(i+1, onceCount).where(
                        (TaskInfo.taskResultId == taskResultId))
        listTag = ET.Element('CheckList')
        #1 – 正常亮照
        # -1 – 未亮照
        # -2 – 无法访问
        # -3 – 亮照信息错误
        # -4:公司名称不一致
        # -5:网址不一致
        # -6:抓取异常失败
        for t in taskinfos:
            checkItem = ET.SubElement(listTag, 'CheckItem')
            ET.SubElement(checkItem, 'Coname').text = t.webId.regID.coname
            ET.SubElement(checkItem, 'url').text = t.webId.domain
            ET.SubElement(checkItem, 'area').text = t.webId.area
            ET.SubElement(checkItem, 'WebType').text = t.webId.type
            checkResult = ""
            if t.state=="2":
                checkResult+="1"
            elif t.state=="3":
                checkResult+="-1"
            elif t.state=="4":
                checkResult+="-3"
            elif t.state=="5" or t.state=="7":
                checkResult+="-2"
            elif t.state=="8":
                checkResult+="-4"
            elif t.state=="9":
                checkResult+="-5"
            elif t.state=="-1":
                checkResult+="-6"
            ET.SubElement(checkItem, 'ALResult').text = t.state
        listTagString =ET.tostring(listTag,encoding="UTF-8").replace("<?xml version='1.0' encoding='UTF-8'?>","")
        dirPath = os.path.abspath(
            str.format(os.path.join('./result/{}/{}/'),  taskId, taskResultId))
        if not os.path.exists(dirPath):
            os.makedirs(dirPath)
        fname = '%s/%s.xml'%(dirPath,str(i+1))
        print 'fname:',fname
        f = open(fname,'a')
        f.write(listTagString)
        f.flush()
        f.close()
Exemplo n.º 11
0
def intervalDelayTask(taskResultId):
    # 获取celery中当前已经正在进行的任务数
    nowCount = TaskInfo.select().order_by(TaskInfo.id).where((TaskInfo.state == '6'))
    if nowCount >= 0:
        # 当celery中正在做的任务数量少于指定的数量时,向celery添加需要执行的任务
        if nowCount <= config.celeryMaxCount:
            taskCount = TaskInfo.select().where(
                    (TaskInfo.state == '1') & (TaskInfo.taskResultId == taskResultId)).count()
            print 'taskCount:', taskCount
            if taskCount == 0:
                singalCheck(taskResultId)
                # 查询该任务设置的延迟时间开启下一次需要检查的任务
                interval = Task.getOne(Task.taskId == (
                    TaskResult.select(TaskResult.taskId).where(TaskResult.taskResultId == taskResultId))).intervalDay
                print 'interval:', interval
                if interval != "":
                    # 生成需要轮巡的新主任务结果记录
                    taskResult = TaskResult()
                    lastTaskResult = TaskResult.getOne(TaskResult.taskResultId == taskResultId)
                    taskResult.taskId = lastTaskResult.taskId
                    taskResult.state = '1'
                    taskResult.save()

                    #将上一次的任务结果编号所对应的webId指定给新的任务结果
                    psql_db.transaction()
                    try:
                        query = (TaskInfo
                            .insert_from(
                                fields=[TaskInfo.webId],
                                query=TaskInfo.select(TaskInfo.webId).where(TaskInfo.taskResultId == lastTaskResult)))
                        query.execute()
                        q = TaskInfo.update(taskResultId=taskResult).where(TaskInfo.taskResultId.is_null())
                        q.execute()
                    except Exception, e:
                        print e
                        psql_db.rollback()
                    # 获取当前时间
                    ctime = datetime.datetime.now()
                    delay_time = int(interval)
                    stime = ctime + datetime.timedelta(seconds=delay_time)
                    scheduler.add_job(intervalDelayTask, "date", next_run_time=stime, args=[taskResult.taskResultId],
                                      jobstore="default", id=taskResult.taskResultId)
            else:
                tasks = TaskInfo.select().order_by(TaskInfo.id).paginate(0, config.sendCeleryCount).where(
                        (TaskInfo.taskResultId == taskResultId) & (TaskInfo.state == '1'))
                for subTask in tasks:
                    subtaskId = subTask.id
                    fetchCycle.apply_async((subtaskId,), queue="celery")
                    # 更新taskinfo状态为已发送
                    q = TaskInfo.update(state='6').where(TaskInfo.id == subtaskId)
                    q.execute()
Exemplo n.º 12
0
def fetchLzPage(isLzUrl, lzPath, shortUrl, subTask):
    # 根据亮照的完整url进行抓取
    status = downloadByPath(isLzUrl, lzPath)
    if not os.path.exists(lzPath):
        print "lzpath:",lzPath
        #logger.debug('亮照页面无法访问:', isLzUrl)
        #dt = format(datetime.datetime.now(), '%Y-%m-%d %H:%M:%S')
        # qw = Website.update(updateDate=dt).where(Website.webId == subTask.webId.webId)
        # qw.execute()
        q = TaskInfo.update(state='5', remark=isLzUrl).where(TaskInfo.id == subTask.id)
        q.execute()
    else:
        try:
            f = open(lzPath, 'r')
            parseData = parserCompanyAndWeb(f.read())
            com = buildCompany(parseData['company'])
            tempBuildWeb = buildWebsite(parseData['web'])
            judgeLzResult(com, tempBuildWeb, shortUrl, subTask)
        except Exception, e:
            print e
Exemplo n.º 13
0
def fetchWebsite(companyName, bigTaskId, subTaskId, url):
    """Download the home page of a website and hand it on for checking.

    The page is stored as ``<url>.html`` below
    ``./data/<url>/<bigTaskId>/<taskResultId>/`` and fetched through
    ``downloadByPhantom``. 'http://' is prepended to ``url`` for the
    download when the address does not contain it.

    The site is treated as inaccessible (``noAccess``) when no file was
    produced, the file is empty, its content matches '.*<body></body>' from
    the start, or it contains the error-correction marker text. Otherwise
    ``makeWeb`` parses and checks the page.

    :param companyName: company name forwarded to ``makeWeb``.
    :param bigTaskId: identifier of the parent task.
    :param subTaskId: primary key of the TaskInfo row being processed.
    :param url: website address as stored for the sub-task.
    """
    mainTask = Task.getOne(Task.taskId == bigTaskId)
    subTask = TaskInfo.getOne(TaskInfo.id == subTaskId)
    # Directory layout: domain / task id / task result id / home page file,
    # e.g. www.yummy77.com/123456789/1/www.yummy77.com.html
    dirPath = os.path.abspath(
            str.format(os.path.join('./data/{}/{}/{}/'), url, bigTaskId, subTask.taskResultId.taskResultId))
    if not os.path.exists(dirPath):
        os.makedirs(dirPath)

    filePath = dirPath + '/' + url + '.html'
    # Add the scheme when the address does not carry one.
    if url.find('http://') == -1:
        url = "http://%s" % url
    # Fetch the page; success is judged below by the presence of the file.
    downloadByPhantom('/usr/bin/phantomjs', os.path.abspath('./phantomjs/fetch.js'),
                      url, filePath, 'utf-8', str(config.request_timeout), str(config.timeout), '', '')
    if not os.path.exists(filePath):
        noAccess(dirPath, filePath, mainTask, url, subTask)
        return

    # Close the file before it may be deleted by noAccess.
    with open(filePath, 'r') as f:
        webContent = f.read()
    # Content such as <html><head></head><body></body></html>.
    noContent = re.match('.*<body></body>', webContent)
    # Completely empty response.
    blankContent = len(webContent)
    # Page returned by a URL-correction service.
    errorSite = webContent.find('网址纠错')
    if noContent is not None or blankContent == 0 or errorSite != -1:
        # Handle all of these like an inaccessible site.
        noAccess(dirPath, filePath, mainTask, url, subTask)
    else:
        # Parse the home page and run the check.
        makeWeb(companyName, filePath, mainTask, url, subTask)
Exemplo n.º 14
0
def judgeNoLz(companyName, shortUrl, subTask):
    """Record that the website of a sub-task shows no license.

    Sets ``updateDate`` of the sub-task's Website row to the current time
    and sets the TaskInfo state to '3'.

    :param companyName: not used by this function.
    :param shortUrl: not used by this function.
    :param subTask: TaskInfo row being processed.
    """
    stamp = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    Website.update(updateDate=stamp).where(
            Website.webId == subTask.webId.webId).execute()
    TaskInfo.update(state='3').where(TaskInfo.id == subTask.id).execute()