Пример #1
0
def updateTaskJob(taskJob):
    """Persist changes to an existing TaskJob, recording a MOD log entry first."""
    # Reload the stored row so the log entry carries the *current* DB status,
    # not whatever status the incoming object happens to hold.
    storedJob = Session.query(TaskJob).filter(TaskJob.id == taskJob.id).first()
    LoggerDao.addTaskJobLogger(
        storedJob,
        LoggerDao.LoggerType.MOD,
        content=u"更新任务",
        status=storedJob.status)
    return BaseDao.updateByPrimary(taskJob, TaskJob)
Пример #2
0
def checkFinishJob():
    """Re-check every finished spider job for page changes.

    Iterates the finish-spider Redis hash; for each stored job template it
    fetches the template's URL and compares the HTTP ``Date`` response header
    against the value recorded earlier.  When they differ, the template is
    re-queued on the assist-spider list and the stored copy is refreshed.

    NOTE(review): the ``Date`` header is the server's response timestamp, so
    it will differ on virtually every request; ``Last-Modified`` may have
    been intended — confirm against the producer of ``urldate``.
    """
    keys=RedisUtils.hkeys(ConfigUtils.getRedisPorperties(KEYMAP.FINISH_SPIDER_REDIS_KEY))
    for key in keys :
        # Each hash value is a JSON-serialized job template.
        temp=RedisUtils.hget(ConfigUtils.getRedisPorperties(KEYMAP.FINISH_SPIDER_REDIS_KEY), key)
        newJobTemplate=json.loads(temp)
        url=newJobTemplate['url']
        try:
            # NOTE(review): assumes each entry of user_agent_list is a full
            # header dict (urllib2.Request expects a dict) — confirm.
            request = urllib2.Request(
                url=url,
                headers=(random.choice(user_agent_list))
            )
            response = urllib2.urlopen(request)
            urldate = response.headers['date']
            tempDate= newJobTemplate['urldate']
            print urldate
            print tempDate
            if urldate == tempDate:
                pass
            else:
                # Page considered changed: remember the new date stamp.
                newJobTemplate['urldate']=urldate

                # Look up the task context for logging.
                # NOTE(review): order_by(" create_time desc") passes raw SQL
                # text; modern SQLAlchemy requires text() for this — verify.
                taskJobHistoryId = newJobTemplate['taskJobHistoryId']
                taskJobHistory=Session.query(TaskJobHistory).filter(TaskJobHistory.id==taskJobHistoryId,TaskJobHistory.delFlag==False).order_by(" create_time desc").first()
                taskJob=Session.query(TaskJob).filter(TaskJob.id==taskJobHistory.taskJobId).first()
                LoggerDao.addTaskJobLogger(taskJob,LoggerDao.LoggerType.URL_TO_REDIS,
                                           jobTemplateId=newJobTemplate['id'],taskJobHistoryId=taskJobHistoryId,
                                           content=u"redis_入库",url=url,status=TaskStatus.RUNNING)

                # Re-queue on the assist-spider list (history id, then the
                # template under a history-scoped key) and refresh the hash.
                RedisUtils.lpush(ConfigUtils.getRedisPorperties(KEYMAP.ASSIST_SPIDER_REDIS_KEY), taskJobHistoryId)
                RedisUtils.lpush(ConfigUtils.getRedisPorperties(KEYMAP.ASSIST_SPIDER_REDIS_KEY) + "_" + taskJobHistoryId,stringify(newJobTemplate))
                RedisUtils.hset(ConfigUtils.getRedisPorperties(KEYMAP.FINISH_SPIDER_REDIS_KEY), newJobTemplate['id'],stringify(newJobTemplate))
        except Exception,e:
            # NOTE(review): 'pass' is a no-op here; the exception is only
            # printed, never logged or re-raised — consider proper logging.
            pass
            print e
Пример #3
0
def delTaskJob(taskId):
    """Soft-delete one or more task jobs.

    Args:
        taskId: a single task id, or several ids joined by commas.

    Each id gets a DEL log entry, then all matching rows have delFlag set
    (rows are never physically removed).
    """
    taskIds = taskId.split(",")
    # Idiom fix: iterate the ids directly instead of range(len(...)).
    for tid in taskIds:
        LoggerDao.addTaskJobLogger(TaskJob(id=tid),
                                   LoggerDao.LoggerType.DEL,
                                   content=u"删除任务")
    # in_() accepts a list directly; the tuple() wrapper was unnecessary.
    Session.query(TaskJob).filter(TaskJob.id.in_(taskIds)).update(
        {TaskJob.delFlag: True}, synchronize_session='fetch')
Пример #4
0
def getTaskJobHistory():
    """Return a paginated history listing for the requested task job."""
    params = loadParams()
    jobId = params.get("taskJobId")
    # Fall back to the first page of ten entries when paging is absent.
    pageNo = params.get("pageNo") or 1
    pageSize = params.get("pageSize") or 10
    history = LoggerDao.getTaskJobHistory(jobId, pageNo, pageSize)
    return ResponseUtils.parseResponse(0, {"result": history})
Пример #5
0
def addTaskJob(taskJob):
    """Create a new TaskJob: assign id/status/table name, log it, persist it.

    The result table name becomes "<tableName>_<YYYYmmddHHMMSS>", where the
    timestamp is taken from taskJob.createTime (defaulting to now, and
    parsed from "%Y-%m-%d %H:%M" when given as a string).
    """
    taskJob.id = uuid.uuid1()
    createTime = taskJob.createTime
    tableName = taskJob.tableName
    taskJob.status = TaskStatus.WAITING
    if createTime is None:
        createTime = datetime.now()
    elif isinstance(createTime, str):
        # Fix: the original tested `type(createTime) == "string"`, which is
        # never true (a type object never equals a str literal), so string
        # inputs fell through and crashed on .strftime below.  The dead
        # branch also called .date(), which would zero the H/M/S part of
        # the timestamp; the full datetime is kept instead.
        # NOTE(review): under Python 2 this matches bytes only, not unicode.
        createTime = datetime.strptime(createTime, "%Y-%m-%d %H:%M")
    timeStr = createTime.strftime("%Y%m%d%H%M%S")
    if tableName is None:
        tableName = "taskJob"
    # Per-run unique table name, e.g. "taskJob_20240101120000".
    taskJob.tableName = "%s_%s" % (tableName, timeStr)
    LoggerDao.addTaskJobLogger(taskJob,
                               LoggerDao.LoggerType.ADD,
                               content=u"创建任务",
                               status=taskJob.status)
    BaseDao.add(taskJob)
Пример #6
0
def startCrawlerByTaskJobId(jobId, taskJobParam=None):
    """Start a crawl for the given task-job id.

    Creates a SINGLE-type history row, marks the job RUNNING, then
    dispatches on the task type:
      * SINGLE -- load the one attached template, ensure its result table
        exists, then push render work to Redis via parseUrlAndInsertRedis.
      * BATCH  -- same steps for every template attached to the job.
      * DEPTH  -- push the task to Redis directly, no template handling.
    Any failure sets the job status to FAIL and records a log entry.
    """
    logging.info('------startCrawlerByTaskJobId-------%s' % jobId)
    taskJobHistory = TaskJobDao.addTaskJobHistroy(jobId,
                                                  TaskJobHistoryType.SINGLE)
    # Literal-SQL id filter; NOTE(review): type=0 presumably selects a
    # single-row query mode -- confirm against query()'s definition.
    taskJob = query(TaskJob, text('id="' + str(jobId) + '"'), type=0)
    TaskJobDao.updateTaskJobStatus(jobId, TaskStatus.RUNNING)
    # tableName = jobTemplate.tableName
    jobTemplateId = None
    dbClient = DbClient()
    LoggerDao.addTaskJobLogger(taskJob,
                               LoggerDao.LoggerType.START,
                               taskJobHistoryId=taskJobHistory.id,
                               status=TaskStatus.RUNNING,
                               content=u"任务启动")
    try:
        if TaskType.SINGLE == str(taskJob.type):
            # Directed (single-template) task.
            jobTemplate = TemplateDao.queryJobTemplate(taskJob.jobTemplateId)
            if jobTemplate == None:
                TaskJobDao.updateTaskJobStatus(jobId, TaskStatus.FAIL)
                LoggerDao.addTaskJobLogger(taskJob,
                                           LoggerDao.LoggerType.START,
                                           jobTemplateId=jobTemplateId,
                                           taskJobHistoryId=taskJobHistory.id,
                                           content=u"no jobTemplate")
                logging.error("no jobTemplate")
                return
            jobTemplateId = jobTemplate.id
            # Keyword-driven crawls can expand into extra task params.
            taskJobParam = crawlerbyKeyWord(taskJob, jobTemplate.id)
            jobTemplateFieldList = TemplateDao.queryJobTemplateFieldByJobTemplateId(
                jobTemplate.id)
            # Task-level database wins over the template's default.
            dbClient.getConnection(taskJob.databaseId
                                   or jobTemplate.databaseId)
            if not dbClient.isTableExist(jobTemplate.tableName):
                dbClient.createTable(jobTemplate.id, jobTemplate.tableName,
                                     jobTemplateFieldList)
            # Override template url/table with the task's own values.
            setattr(jobTemplate, "url", taskJob.url)
            setattr(jobTemplate, "tableName", taskJob.tableName)
            LoggerDao.addTaskJobLogger(taskJob,
                                       LoggerDao.LoggerType.START,
                                       jobTemplateId=jobTemplate.id,
                                       taskJobHistoryId=taskJobHistory.id,
                                       status=TaskStatus.RUNNING,
                                       content=u"定向任务任务启动")
            parseUrlAndInsertRedis(taskJob,
                                   taskJobParam=taskJobParam,
                                   taskJobHistory=taskJobHistory,
                                   jobTemplate=jobTemplate)
        elif TaskType.BATCH == str(taskJob.type):
            # Batch task: run every template attached to this job.
            jobTemplateList = TemplateDao.loadTemplateByTaskJobId({
                "taskJobId":
                taskJob.id,
                "action":
                "1"
            })
            if jobTemplateList.get("jobTemplateList") != None and len(
                    jobTemplateList.get("jobTemplateList")) > 0:
                for jobTemplate in jobTemplateList.get("jobTemplateList"):
                    jobTemplateId = jobTemplate.id
                    taskJobParam = crawlerbyKeyWord(taskJob, jobTemplate.id)
                    jobTemplateFieldList = TemplateDao.queryJobTemplateFieldByJobTemplateId(
                        jobTemplate.id)
                    LoggerDao.addTaskJobLogger(
                        taskJob,
                        LoggerDao.LoggerType.START,
                        jobTemplateId=jobTemplate.id,
                        taskJobHistoryId=taskJobHistory.id,
                        status=TaskStatus.RUNNING,
                        content=u"批量任务启动")
                    # Template database unless unset/-1, else the task's.
                    databaseId = jobTemplate.databaseId if jobTemplate.databaseId != "-1" and jobTemplate.databaseId != None else taskJob.databaseId
                    dbClient.getConnection(databaseId)
                    # NOTE(review): dbClient was constructed above and is
                    # never rebound, so this None check cannot trigger --
                    # perhaps getConnection's result was meant -- confirm.
                    if dbClient == None:
                        TaskJobDao.updateTaskJobStatus(jobId, TaskStatus.FAIL)
                        LoggerDao.addTaskJobLogger(
                            taskJob,
                            LoggerDao.LoggerType.START,
                            jobTemplateId=jobTemplateId,
                            taskJobHistoryId=taskJobHistory.id,
                            content=u"no dbClient")
                        logging.error("no dbClient")
                        return
                    if not dbClient.isTableExist(jobTemplate.tableName):
                        dbClient.createTable(jobTemplate.id,
                                             jobTemplate.tableName,
                                             jobTemplateFieldList)
                    parseUrlAndInsertRedis(taskJob,
                                           taskJobParam=taskJobParam,
                                           taskJobHistory=taskJobHistory,
                                           jobTemplate=jobTemplate)
            else:
                TaskJobDao.updateTaskJobStatus(jobId, TaskStatus.FAIL)
                LoggerDao.addTaskJobLogger(taskJob,
                                           LoggerDao.LoggerType.START,
                                           jobTemplateId=jobTemplateId,
                                           taskJobHistoryId=taskJobHistory.id,
                                           content=u"no jobTemplate")
                logging.error("no jobTemplate")
        elif TaskType.DEPTH == str(taskJob.type):
            # Depth task: no templates; push the task itself.
            parseUrlAndInsertRedis(taskJob,
                                   taskJobParam=taskJobParam,
                                   taskJobHistory=taskJobHistory)

            # print mainId
            # if tempList:
            #     for temp in tempList:
            #         tempNode = hashConsistency.get_node(stringify(temp))
            #         nodePool.append(tempNode)
            #         RedisUtils.lpush(tempNode+"_"+ConfigUtils.getRedisPorperties(KEYMAP.ASSIST_SPIDER_REDIS_KEY), taskJobHistory.id)
            #         RedisUtils.lpush(tempNode+"_"+ConfigUtils.getRedisPorperties(KEYMAP.ASSIST_SPIDER_REDIS_KEY) + "_" + taskJobHistory.id,stringify(temp))
            #         RedisUtils.hset(ConfigUtils.getRedisPorperties(KEYMAP.FINISH_SPIDER_REDIS_KEY), temp.id, stringify(temp))
    except Exception, e:
        TaskJobDao.updateTaskJobStatus(jobId, TaskStatus.FAIL)
        LoggerDao.addTaskJobLogger(taskJob,
                                   LoggerDao.LoggerType.START,
                                   jobTemplateId=jobTemplateId,
                                   taskJobHistoryId=taskJobHistory.id,
                                   content=u"解析异常" + str(e))
        # NOTE(review): this logs the Exception *class*, not the caught
        # error -- repr(e) was almost certainly intended.
        logging.error(repr(Exception))
Пример #7
0
def parseUrlAndInsertRedis(taskJob,
                           paramMap=None,
                           taskJobParam=None,
                           taskJobHistory=None,
                           jobTemplate=None):
    """Render the task's URL(s) and enqueue crawl work into Redis.

    DEPTH tasks push the task itself onto the depth-spider queue after a
    bloom-filter de-dup check.  All other task types build one JobTemplate
    copy per parameter combination (or a single copy when there are no
    parameters) and push each onto the assist-spider queue, recording a
    UrlClazz status row for every generated URL.

    Args:
        taskJob: the task being started (id, type, url, databaseId, ...).
        paramMap: optional template-render context dict; defaults to empty.
        taskJobParam: extra parameter (or list of them) to merge in.
        taskJobHistory: history row whose id namespaces the Redis keys.
        jobTemplate: template the generated jobs are copied from.
    """
    # Fix: mutable default argument ({}) replaced by a None sentinel so
    # separate calls never share (and leak state through) one dict.
    if paramMap is None:
        paramMap = {}
    if TaskType.DEPTH == str(taskJob.type):
        # Depth crawl: enqueue the whole task once, de-duplicated by URL.
        if bloomfilter_check(taskJob.id, taskJob.url):
            RedisUtils.lpush(
                ConfigUtils.getRedisPorperties(KEYMAP.DEPTH_SPIDER_REDIS_KEY),
                taskJobHistory.id)
            RedisUtils.lpush(
                ConfigUtils.getRedisPorperties(KEYMAP.DEPTH_SPIDER_REDIS_KEY) +
                "_" + taskJobHistory.id, stringify(taskJob))
    else:
        url = taskJob.url
        # Collect parameters: task-level, then caller-supplied, then
        # template-level.
        taskJobParamList = TaskJobDao.queryTaskJobParam(taskJob.id)
        if taskJobParam is not None:
            if isinstance(taskJobParam, list):
                taskJobParamList.extend(taskJobParam)
            else:
                taskJobParamList.append(taskJobParam)
        jobTemplateParamList = TemplateDao.queryJobTemplateParamByJobTemplateId(
            jobTemplate.id)
        if jobTemplateParamList is not None and len(jobTemplateParamList) > 0:
            taskJobParamList.extend(jobTemplateParamList)
        if taskJobHistory is not None:
            # NOTE(review): this param is appended to jobTemplateParamList
            # AFTER that list's contents were copied into taskJobParamList
            # above, so it never reaches the rendering loop -- confirm
            # whether it should be appended to taskJobParamList instead.
            jobTemplateParamTaskJob = JobTemplateParam(
                paramNameEn="task_job_id_sequence",
                paramValue=str(taskJobHistory.id))
            jobTemplateParamList.append(jobTemplateParamTaskJob)
        if taskJobParamList == None or len(taskJobParamList) <= 0:
            # No parameters: enqueue exactly one job for the rendered URL.
            if str(taskJob.type) == TaskType.BATCH:
                url = jobTemplate.url
            renderUrl = RenderUtils.render(url, paramMap)
            newJobTemplate = ClassCopy.copyToNewInstances(
                jobTemplate, JobTemplate)
            taskJobHistoryId = taskJobHistory.id
            # NOTE(review): this branch records jobTemplate.url while the
            # loop branch below records renderUrl -- confirm intent.
            urlListStatus = UrlClazz(url=jobTemplate.url,
                                     parentUrl=paramMap.get("task_job_url"),
                                     jobTemplateId=jobTemplate.id,
                                     jobTemplateParentId=jobTemplate.parentId,
                                     taskJobId=taskJob.id,
                                     taskJobHistoryId=taskJobHistoryId)
            setattr(newJobTemplate, "taskJobId", taskJob.id)
            setattr(newJobTemplate, "taskJobHistoryId", taskJobHistoryId)
            setattr(newJobTemplate, "url", renderUrl)
            setattr(newJobTemplate, "extraData", paramMap)
            setattr(newJobTemplate, "urlListStatusId", urlListStatus.id)
            LoggerDao.addTaskJobLogger(taskJob,
                                       LoggerDao.LoggerType.URL_TO_REDIS,
                                       jobTemplateId=newJobTemplate.id,
                                       taskJobHistoryId=taskJobHistoryId,
                                       content=u"redis_入库",
                                       url=renderUrl,
                                       status=TaskStatus.RUNNING)
            # Queue protocol: push the history id, then the serialized
            # template under a history-scoped key, then refresh the
            # finish-spider hash entry.
            RedisUtils.lpush(
                ConfigUtils.getRedisPorperties(KEYMAP.ASSIST_SPIDER_REDIS_KEY),
                taskJobHistoryId)
            RedisUtils.lpush(
                ConfigUtils.getRedisPorperties(KEYMAP.ASSIST_SPIDER_REDIS_KEY)
                + "_" + taskJobHistoryId, stringify(newJobTemplate))
            RedisUtils.hset(
                ConfigUtils.getRedisPorperties(KEYMAP.FINISH_SPIDER_REDIS_KEY),
                newJobTemplate.id, stringify(newJobTemplate))
            saveUrlListStatus(urlListStatus)
        else:
            # One job per rendered parameter combination.
            for data in paraseJobTemplateList(taskJobParamList, paramMap):
                if str(taskJob.type) == TaskType.BATCH:
                    url = jobTemplate.url
                parentId = paramMap.get("dataParentId")
                # Fix: dict(a.items() + b.items()) is Python-2-only (list
                # concat of items views fails on Python 3); build the merged
                # dict portably instead.  data's keys still win on conflict.
                merged = dict(paramMap)
                merged.update(data)
                paramMap = merged
                renderUrl = RenderUtils.render(url, paramMap)
                newJobTemplate = ClassCopy.copyToNewInstances(
                    jobTemplate, JobTemplate)
                taskJobHistoryId = taskJobHistory.id
                urlListStatus = UrlClazz(
                    url=renderUrl,
                    parentUrl=paramMap.get("task_job_url"),
                    jobTemplateId=jobTemplate.id,
                    jobTemplateParentId=jobTemplate.parentId,
                    taskJobId=taskJob.id,
                    taskJobHistoryId=taskJobHistoryId)
                setattr(newJobTemplate, "taskJobId", taskJob.id)
                setattr(newJobTemplate, "taskJobHistoryId", taskJobHistoryId)
                setattr(newJobTemplate, "url", renderUrl)
                setattr(newJobTemplate, "dataParentId", parentId)
                setattr(newJobTemplate, "extraData", paramMap)
                setattr(newJobTemplate, "urlListStatusId", urlListStatus.id)
                LoggerDao.addTaskJobLogger(taskJob,
                                           LoggerDao.LoggerType.URL_TO_REDIS,
                                           jobTemplateId=newJobTemplate.id,
                                           taskJobHistoryId=taskJobHistoryId,
                                           content=u"redis_入库_多参数",
                                           url=renderUrl,
                                           status=TaskStatus.RUNNING)
                RedisUtils.lpush(
                    ConfigUtils.getRedisPorperties(
                        KEYMAP.ASSIST_SPIDER_REDIS_KEY), taskJobHistoryId)
                RedisUtils.lpush(
                    ConfigUtils.getRedisPorperties(
                        KEYMAP.ASSIST_SPIDER_REDIS_KEY) + "_" +
                    taskJobHistoryId, stringify(newJobTemplate))
                RedisUtils.hset(
                    ConfigUtils.getRedisPorperties(
                        KEYMAP.FINISH_SPIDER_REDIS_KEY), newJobTemplate.id,
                    stringify(newJobTemplate))
                saveUrlListStatus(urlListStatus)