Пример #1
0
def contentDetail(taskJobId):
    """Assemble the detail view for a task job.

    Collects the job itself, its database source, parent / sibling /
    child jobs, timer, template and template fields, and marks each
    template field with a chooseFlag indicating whether the task job
    has selected it.

    :param taskJobId: id of the task job to describe.
    :return: dict with a single "result" key holding a TaskJobDetail.
    """
    taskJob = loadTaskById(taskJobId)
    dbSource = queryDbSource(taskJob.databaseId)
    # sibling tasks (same parent)
    brotherList = loadChildByParentId(taskJob.parentId)
    # child tasks
    childrenList = loadChildByParentId(taskJob.id)
    # parent task
    parent = loadTaskById(taskJob.parentId)
    timerJob = loadTimerJob(taskJob.taskJobTimerId)
    jobTemplate = TemplateDao.queryJobTemplate(taskJob.jobTemplateId)
    jobTemplateList = TemplateDao.queryJobTemplateFieldByJobTemplateId(
        taskJob.jobTemplateId)
    taskReFieldList = loadAllTaskFieldById(taskJob.id)
    # Template fields selected by this task job.
    selectedFieldIds = set(
        reField.jobTemplateFieldId for reField in taskReFieldList)
    for templateField in jobTemplateList:
        templateField.chooseFlag = templateField.id in selectedFieldIds
    detail = TaskJobDetail(dbSource, taskJob, brotherList, parent, timerJob,
                           jobTemplate, jobTemplateList, taskReFieldList,
                           childrenList)
    return {"result": detail}
Пример #2
0
def exportJobTemplate(jobTemplateId):
    """Serialize a job template (with its params and fields) into a
    downloadable attachment response.

    :param jobTemplateId: id of the template to export.
    :return: response carrying the serialized template as
        ``template<id>.data``.
    """
    jobTemplate = queryJobTemplate(jobTemplateId)
    newJobTemplate = ClassCopy.copyToNewInstances(jobTemplate, JobTemplate)
    # Detach params and fields from the ORM by copying them into plain
    # instances before serialization.
    paramList = TemplateDao.queryJobTemplateParamByJobTemplateId(
        jobTemplateId) or []
    newJobTemplateParamList = [
        ClassCopy.copyToNewInstances(param, JobTemplateParam)
        for param in paramList
    ]
    fieldList = TemplateDao.queryJobTemplateFieldByJobTemplateId(
        jobTemplateId) or []
    newJobTemplateFieldList = [
        ClassCopy.copyToNewInstances(field, JobTemplateField)
        for field in fieldList
    ]
    serializer = SeriesEntity.JobTemplateSerialize(
        newJobTemplate, newJobTemplateFieldList, newJobTemplateParamList)
    response = make_response(serializer.serialize())
    response.headers[
        "Content-Disposition"] = "attachment; filename=template%s.data" % (
            jobTemplateId)
    return response
Пример #3
0
def parseUrlAndInsertRedis(jobTemplate,
                           paramMap=None,
                           jobParamList=None,
                           taskJobHistory=None):
    """Render the job template's URL with each parameter combination and
    build per-URL JobTemplate copies tagged with the history id.

    :param jobTemplate: template whose ``url`` is rendered.
    :param paramMap: extra render variables (defaults to an empty dict).
    :param jobParamList: additional params merged with the template's own.
    :param taskJobHistory: optional history row; its id is attached to
        every produced copy as ``taskJobHistoryId``.
    """
    # BUG FIX: the default was the mutable literal ``paramMap={}``, which
    # is shared across calls and leaks state; use None and build a fresh
    # dict per call instead.
    if paramMap is None:
        paramMap = {}
    url = jobTemplate.url
    jobTemplateId = jobTemplate.id
    taskJobHistoryId = None
    if taskJobHistory is not None:
        taskJobHistoryId = taskJobHistory.id
    jobTemplateParamList = TemplateDao.queryJobTemplateParamByJobTemplateId(
        jobTemplateId)
    if jobTemplateParamList is None:
        jobTemplateParamList = []
    if jobParamList is not None:
        jobTemplateParamList.extend(jobParamList)
    if len(jobTemplateParamList) <= 0:
        # No params at all: render the URL once with paramMap only.
        renderUrl = RenderUtils.render(url, paramMap)
        newJobTemplate = ClassCopy.copyToNewInstances(jobTemplate, JobTemplate)
        newJobTemplate.url = renderUrl
        setattr(newJobTemplate, "taskJobHistoryId", taskJobHistoryId)
        # RedisUtils.lpush(ConfigUtils.getRedisPorperties(KEYMAP.TEMPLATE_SPIDER_NAME_REDIS_KEY), taskJobHistoryId)
        # RedisUtils.lpush(ConfigUtils.getRedisPorperties(KEYMAP.TEMPLATE_SPIDER_NAME_REDIS_KEY)+"_"+taskJobHistoryId, stringify(newJobTemplate))
    else:
        # One rendered URL per parameter combination.
        for data in paraseJobTemplateList(jobTemplateParamList, paramMap,
                                          jobTemplate):
            # NOTE: dict.items() + dict.items() is Python 2 only; on
            # Python 3 this would need dict(paramMap, **data) or {**..}.
            renderUrl = RenderUtils.render(
                url, dict(paramMap.items() + data.items()))
            newJobTemplate = ClassCopy.copyToNewInstances(
                jobTemplate, JobTemplate)
            newJobTemplate.url = renderUrl
            setattr(newJobTemplate, "taskJobHistoryId", taskJobHistoryId)
Пример #4
0
def relationTaskJob(jobTemplateId, taskJobId, fieldIds):
    """Re-bind a task job to a template's fields.

    Deletes the job's existing field relations, creates a TaskJobReField
    row per requested field, points the task job at the template, and
    commits. ``fieldIds == "-1"`` means "select every field of the
    template"; otherwise it is a comma-separated list of field ids.

    :param jobTemplateId: template the task job is bound to.
    :param taskJobId: task job being re-bound.
    :param fieldIds: "-1", or comma-separated field ids.
    """
    TaskJobDao.delTaskJobReRelationByTaskJobId(taskJobId)
    if fieldIds is not None and str(fieldIds) == "-1":
        # "-1": take every field defined on the template.
        fieldIdList = [
            field.id for field in
            TemplateDao.queryJobTemplateFieldByJobTemplateId(jobTemplateId)
        ]
    else:
        fieldIdList = fieldIds.split(",")
    for fieldId in fieldIdList:
        taskJobReField = TaskJobReField(id=uuid.uuid1())
        taskJobReField.jobTemplateId = jobTemplateId
        taskJobReField.taskJobId = taskJobId
        taskJobReField.delFlag = False
        taskJobReField.jobTemplateFieldId = fieldId
        taskJobReField.createTime = datetime.now()
        Session.add(taskJobReField)
    # BUG FIX: the "-1" branch previously returned early, so the task
    # job's jobTemplateId was never updated and the added relations were
    # never flushed/committed. Both paths now reach the update + commit.
    Session.query(TaskJob).filter(TaskJob.id == taskJobId).update(
        {TaskJob.jobTemplateId: jobTemplateId})
    Session.flush()
    Session.commit()
Пример #5
0
def copyTask(taskJobId):
    """Duplicate a task job together with its field and template relations.

    The copy gets a fresh uuid, a "_副本" (copy) name suffix, and status 0
    (not running); every related TaskJobReField / TaskJobReTemplate row is
    cloned and re-pointed at the new job.

    :param taskJobId: id of the task job to duplicate.
    :return: dict with "flag" (True) and "newTaskId" (str of new uuid).
    """
    original = loadTaskById(taskJobId)
    reFieldRows = loadTaskJobReFieldList(taskJobId)
    reTemplateRows = TemplateDao.loadTaskJobReTemplate(taskJobId)
    newId = uuid.uuid1()
    clone = ClassCopy.copyToNewInstances(original, TaskJob)
    clone.id = newId
    clone.createTime = datetime.now()
    clone.name = original.name + "_副本"
    # status 0: not yet running
    clone.status = 0
    Session.add(clone)
    for reField in reFieldRows:
        fieldClone = ClassCopy.copyToNewInstances(reField, TaskJobReField)
        fieldClone.taskJobId = newId
        fieldClone.createTime = datetime.now()
        fieldClone.id = uuid.uuid1()
        Session.add(fieldClone)
    if reTemplateRows is not None:
        for reTemplate in reTemplateRows:
            templateClone = ClassCopy.copyToNewInstances(
                reTemplate, TaskJobReTemplate)
            templateClone.taskJobId = newId
            templateClone.createTime = datetime.now()
            templateClone.id = uuid.uuid1()
            Session.add(templateClone)
    return {"flag": True, "newTaskId": str(newId)}
Пример #6
0
 def getScrapBaseItem(self, taskJobId):
     """Build (and cache) the ScrapBaseItem for a task job.

     Looks the item up in the "task_job" cache keyed by the request's
     taskJobHistoryId; on a miss it loads the task job, template, fields
     and params, stores the assembled item, and also warms the
     "task_job_history" cache.

     :param taskJobId: id of the task job to assemble the item for.
     :return: cached ScrapBaseItem, or an empty ScrapBaseItem when the
         cache lookup still misses.
     """
     # History id comes from request params; may be "" when absent.
     taskJobHistoryId = self.params.get("taskJobHistoryId") or ""
     if CacheFactory.get("task_job", taskJobHistoryId) == None:
         # NOTE(review): "querTaskJob" looks like a typo for queryTaskJob —
         # confirm the helper's real name.
         taskJob = querTaskJob(taskJobId)
         scrapBaseItem = ScrapBaseItem()
         jobTemplateFieldList = queryFieldByTaskJobId(taskJobId)
         # Fall back to the template's own field list when the task job
         # has no field selection of its own.
         if jobTemplateFieldList == None or len(jobTemplateFieldList) == 0:
             jobTemplateFieldList = TemplateDao.queryJobTemplateFieldByJobTemplateId(
                 taskJob.jobTemplateId)
         jobTemplate = queryJobTemplate(taskJob.jobTemplateId)
         jobTemplateParamList = queryJobParam(taskJobId)
         taskJobParamList = TaskJobDao.queryTaskJobParam(taskJobId)
         scrapBaseItem["jobTemplateFieldList"] = jobTemplateFieldList
         scrapBaseItem["jobTemplate"] = jobTemplate
         scrapBaseItem["taskJobId"] = taskJobId
         scrapBaseItem["taskJob"] = taskJob
         setattr(taskJob, "taskJobHistoryId", taskJobHistoryId)
         scrapBaseItem["jobTemplateParamList"] = jobTemplateParamList
         scrapBaseItem["taskJobParamList"] = taskJobParamList
         CacheFactory.cache("task_job", taskJobHistoryId, scrapBaseItem)
         taskJobHistory = None
         # Warm the history cache only for a non-empty id not cached yet.
         if taskJobHistoryId != None and taskJobHistoryId != "" and CacheFactory.get(
                 "task_job_history", taskJobHistoryId) == None:
             taskJobHistory = TaskJobDao.loadTaskJobHistoryById(
                 taskJobHistoryId)
             CacheFactory.cache("task_job_history", taskJobHistoryId,
                                taskJobHistory)
         taskJobHistory = CacheFactory.get("task_job_history",
                                           taskJobHistoryId)
         # NOTE: "taskJobHistroy" (sic) is the key consumers read elsewhere
         # in this file — keep the misspelling.
         scrapBaseItem["taskJobHistroy"] = taskJobHistory
     return CacheFactory.get("task_job",
                             taskJobHistoryId) or ScrapBaseItem()
Пример #7
0
def exportJobTemplateText(jobTemplateId):
    jobTemplate = queryJobTemplate(jobTemplateId)
    jobTemplateFieldList = TemplateDao.queryJobTemplateFieldByJobTemplateId(
        jobTemplateId)
    jobTemplateSerialize = SeriesEntity.JobTemplateSerialize(
        jobTemplate, jobTemplateFieldList)
    print jobTemplateSerialize.serialize()
    print SeriesEntity.JobTemplateSerialize.deSerialize(
        jobTemplateSerialize.serialize())  #.jobTemplate.id
    return
Пример #8
0
def importJobTemplate(jobTemplateId):
    """Build a download response carrying a serialized job template.

    NOTE(review): despite the name this produces an export-style
    attachment, mirroring exportJobTemplate — confirm the intended
    direction with the route that calls it.

    :param jobTemplateId: id of the template to serialize.
    :return: response with the template bytes as ``template<id>.data``.
    """
    jobTemplate = queryJobTemplate(jobTemplateId)
    jobTemplateFieldList = TemplateDao.queryJobTemplateFieldByJobTemplateId(
        jobTemplateId)
    jobTemplateSerialize = SeriesEntity.JobTemplateSerialize(
        jobTemplate, jobTemplateFieldList)
    # BUG FIX: the serializer OBJECT was previously passed to
    # make_response; serialize it first, as exportJobTemplate does.
    response = make_response(jobTemplateSerialize.serialize())
    response.headers[
        "Content-Disposition"] = "attachment; filename=template%s.data" % (
            jobTemplateId)
    return response
Пример #9
0
def qrcode():
    """Screenshot the ad target URL resolved from request params and
    return it as a PNG; on failure return an error response.

    Reads job_template_id / field_id / field_name from the request,
    resolves the target URL from the template's table, and screenshots it.
    """
    params = loadParams()
    template_id = params.get('job_template_id')
    field_id = params.get('field_id')
    field_name = params.get('field_name')
    template = TemplateDao.queryJobTemplate(template_id)
    target_url = get_result(template.tableName, field_name, field_id)
    screenshot_path = AdUtil.take_screenshot(target_url)
    # No screenshot produced -> error payload instead of an image.
    if not screenshot_path:
        return parseResponse(-1)
    return send_file(screenshot_path, mimetype='image/png')
Пример #10
0
    def createTableByTaskJobId(self,
                               jobid,
                               tableName=None,
                               jobTemplateFieldList=None):
        """Create the MongoDB collection for a task job.

        Builds a field-name list from the job template fields plus fixed
        bookkeeping names, then inserts one seed document so the
        collection exists.

        :param jobid: task job id; also used to load template fields when
            jobTemplateFieldList is not supplied.
        :param tableName: collection name; defaults to the task job's
            tableName when None.
        :param jobTemplateFieldList: pre-fetched template fields; loaded
            from TemplateDao when empty.
        :return: None
        """
        if tableName == None:
            taskJob = TaskJobDao.loadTaskById(jobid)

            tableName = taskJob.tableName

        # if self.isTableExist(tableName):
        #     logging.info('isTableExist:%s' % ('TRUE'))
        #     return
        if jobTemplateFieldList == None or len(jobTemplateFieldList) == 0:
            jobTemplateFieldList = TemplateDao.queryJobTemplateFieldByJobTemplateId(
                jobid)
        if (jobTemplateFieldList == None or len(jobTemplateFieldList) == 0):
            return
        fieldList = []
        for jobTemplateField in jobTemplateFieldList:
            dataLength = jobTemplateField.dataLength
            dataType = jobTemplateField.dataType or "varchar"
            fieldNameEn = jobTemplateField.fieldNameEn
            if dataType == 'int':
                fieldList.append("`%s` %s" % (fieldNameEn, dataType))
            elif dataLength != None and dataLength > 0 or (
                    dataLength == None and dataType == "varchar"):
                # NOTE(review): dataLength is unconditionally overwritten
                # with "1024" here (dataType is never 'int' in this branch),
                # so a field's declared length is ignored — confirm intent.
                if dataType != 'int':
                    dataLength = "1024"
                fieldList.append("`%s` %s(%s)" %
                                 (fieldNameEn, dataType, dataLength))
            else:
                fieldList.append("`%s` %s" % (fieldNameEn, dataType))
        # Fixed bookkeeping names; some carry a trailing space and are used
        # verbatim as document keys below.
        fieldList.append("id ")
        fieldList.append("task_job_create_time")
        fieldList.append("task_job_del_flag ")
        fieldList.append("task_job_id_sequence")
        fieldList.append("parent_id ")
        fieldList.append("task_job_url ")
        fielddic = {}
        collection = self.db[tableName]
        # Seed document: empty strings everywhere except the timestamp.
        for index, item in enumerate(fieldList):
            if item == 'task_job_create_time':
                fielddic[item] = time.strftime('%Y-%m-%d %H:%M:%S')
            else:
                fielddic[item] = ''
        collection.insert(fielddic)
Пример #11
0
    def createTableByTaskJobId(self,
                               taskJobId,
                               tableName=None,
                               jobTemplateFieldList=None):
        """Create the relational table for a task job.

        Columns come from the job template fields plus fixed bookkeeping
        columns; skipped entirely if the table already exists.

        :param taskJobId: task job id; also used to load template fields
            when jobTemplateFieldList is not supplied.
        :param tableName: table name; defaults to the task job's tableName.
        :param jobTemplateFieldList: pre-fetched template fields; loaded
            from TemplateDao when empty.
        :return: None
        """
        if tableName == None:
            taskJob = TaskJobDao.loadTaskById(taskJobId)

            tableName = taskJob.tableName

        # Idempotence: never recreate an existing table.
        if self.isTableExist(tableName):
            logging.info('isTableExist:%s' % ('TRUE'))
            return
        if jobTemplateFieldList == None or len(jobTemplateFieldList) == 0:
            jobTemplateFieldList = TemplateDao.queryJobTemplateFieldByJobTemplateId(
                taskJobId)
        if (jobTemplateFieldList == None or len(jobTemplateFieldList) == 0):
            return
        fieldList = []
        for jobTemplateField in jobTemplateFieldList:
            dataLength = jobTemplateField.dataLength
            # NOTE(review): str(None) == "None" is truthy, so the "varchar"
            # fallback here can never apply — confirm whether dataType can
            # be None upstream.
            dataType = str(jobTemplateField.dataType) or "varchar"
            fieldNameEn = str(jobTemplateField.fieldNameEn)
            if dataType == 'int':
                fieldList.append("%s %s" % (fieldNameEn, dataType))
            elif dataLength != None and dataLength > 0 or (
                    dataLength == None and dataType == "varchar"):
                # NOTE(review): dataLength is unconditionally overwritten
                # with "1024" in this branch, ignoring the declared length —
                # confirm intent.
                if dataType != 'int':
                    dataLength = "1024"
                fieldList.append("%s %s(%s)" %
                                 (fieldNameEn, dataType, dataLength))
            else:
                fieldList.append("%s %s" % (fieldNameEn, dataType))
        # Fixed bookkeeping columns.
        fieldList.append("id varchar(50) primary key")
        fieldList.append("task_job_create_time datetime")
        fieldList.append("task_job_del_flag int")
        fieldList.append("task_job_id_sequence varchar(50)")
        fieldList.append("parent_id varchar(50)")
        fieldList.append("task_job_url varchar(255)")
        create_table_sql = "create table %s(%s)" % (tableName,
                                                    ",".join(fieldList))
        self.execute(create_table_sql)
Пример #12
0
    def createTableByTaskJobId(self,jobid,tableName=None,jobTemplateFieldList=None):
        """Create the MySQL table for a task job and add a lookup index.

        Columns come from the job template fields plus fixed bookkeeping
        columns; an index on (parent_id, task_job_id_sequence) is added to
        speed up the queries that filter on those columns.

        :param jobid: task job id; also used to load template fields when
            jobTemplateFieldList is not supplied.
        :param tableName: table name; defaults to the task job's tableName.
        :param jobTemplateFieldList: pre-fetched template fields; loaded
            from TemplateDao when empty.
        :return: None
        """
        if tableName==None:
            taskJob = TaskJobDao.loadTaskById(jobid)

            tableName = taskJob.tableName

        # if self.isTableExist(tableName):
        #     logging.info('isTableExist:%s' % ('TRUE'))
        #     return
        if jobTemplateFieldList==None or len(jobTemplateFieldList)==0:
            jobTemplateFieldList = TemplateDao.queryJobTemplateFieldByJobTemplateId(jobid)
        if (jobTemplateFieldList == None or len(jobTemplateFieldList) == 0):
            return
        fieldList = []
        for jobTemplateField in jobTemplateFieldList:
            dataLength = jobTemplateField.dataLength
            dataType = jobTemplateField.dataType or "varchar"
            fieldNameEn = jobTemplateField.fieldNameEn
            if dataType=='int':
                fieldList.append("`%s` %s" % (fieldNameEn, dataType))
            elif dataLength != None and dataLength > 0 or (dataLength==None and dataType=="varchar"):
                # NOTE(review): dataLength is always overwritten with "1024"
                # in this branch, so a field's declared length is ignored —
                # confirm intent.
                if dataType!='int':
                    dataLength="1024"
                fieldList.append("`%s` %s(%s)" % (fieldNameEn, dataType, dataLength))
            else:
                fieldList.append("`%s` %s" % (fieldNameEn, dataType))
        # Fixed bookkeeping columns.
        fieldList.append("id varchar(50) primary key")
        fieldList.append("task_job_create_time datetime")
        fieldList.append("task_job_del_flag tinyint")
        fieldList.append("task_job_id_sequence varchar(50)")
        fieldList.append("parent_id varchar(50)")
        fieldList.append("task_job_url varchar(1024)")
        create_table_sql = "create table %s(%s)" % (tableName, ",".join(fieldList))
        self.cursor.execute(create_table_sql)
        # Index the columns used by lookup queries to improve efficiency.
        self.cursor.execute("alter table `%s` add index index_name(`parent_id`,`task_job_id_sequence`)"%(tableName))
Пример #13
0
    #             select * from %s where if('%s'!='' and '%s'!='-1',parent_id='%s',1=1)and if('%s'!='',task_job_id_sequence='%s',1=1) limit %s, %s
    #         """ % (tableName, parentId,parentId,parentId,taskHistoryId,taskHistoryId, st, pageCount)
    #     return self.execQuery(sql)


if __name__ == '__main__':
    # Manual smoke test: create a table on the configured SQL Server
    # instance and insert one row.
    # create table
    # SQLServerUtils().createTableByTaskJobId('1')
    cur_database_param = {
        "host": "10.128.100.203",
        "dbname": "demo",
        "username": "******",
        "password": "******"
    }
    sqlserve = sqlserver(cur_database_param)
    jobTemplateFieldList = TemplateDao.queryJobTemplateFieldByJobTemplateId(
        'b78f69bc-4c1a-11e7-ba08-000c299438c6')
    sqlserve.createTableByTaskJobId('lsl0s', 'kkkkkk00', jobTemplateFieldList)
    # print sqlserver
    import random

    number = random.randint(10, 100)
    d = {
        "pagecount": "116",
        "id": str(number),
        "task_job_id_sequence": 'e57f7640-6d1d-11e7-bc2b-38c986148389'
    }
    # NOTE(review): the table created above is 'kkkkkk00' but the insert
    # targets 'kkkkkk' — confirm which table name is intended.
    sql = sqlserve.insert('kkkkkk', d)
    sqlserve.execute(sql)
    # insert data
    # taskJob = TaskJob()
    # taskJob.tableName='taskJob_20170414000000'
Пример #14
0
    def _do_upinsert(self, item):
        """Persist one crawled item into the configured datastore.

        Pulls the scraped rows out of ``item``, decorates each row with
        bookkeeping fields (url, parent id, fresh uuid, kafka extras),
        builds insert statements via the db client, and schedules child
        templates for every inserted row. When no db connection or no
        data is available and the main spider queue is drained, the task
        is marked SUCCESS and its urls stopped.

        :param item: dict-like scrapy item with keys data / url /
            jobTemplateFieldList / jobTemplate / taskJob / taskJobHistroy.
        """
        now = str(datetime.now())
        data = item["data"]
        url = item["url"]
        jobTemplateFieldList = item["jobTemplateFieldList"]
        jobTemplate = item["jobTemplate"]
        # dataParentId is optional on the template.
        self.dataParentId = jobTemplate.dataParentId if hasattr(
            jobTemplate, "dataParentId") else None
        extraData = jobTemplate.extraData
        self.taskJob = item["taskJob"]
        # searchTaskJob = item["searchTaskJob"]
        taskJobHistroy = item["taskJobHistroy"]
        self.taskJobHistoryId = jobTemplate.taskJobHistoryId
        taskJobHistroyId = str(taskJobHistroy.id)
        paramMap = {}
        self.taskJobParamList = []
        # Tag every row of this run with the history id sequence.
        if taskJobHistroy != None:
            self.taskJobParamList.append(
                TaskJobParam(paramNameEn="task_job_id_sequence",
                             paramValue=taskJobHistroyId))
            paramMap["task_job_id_sequence"] = taskJobHistroyId
        # if searchTaskJob!=None:
        #     self.taskJobParamList.append(TaskJobParam(paramNameEn=searchTaskJob.name, paramValue=searchTaskJob.name))
        #     paramMap[searchTaskJob.name] = searchTaskJob.name
        # self.taskJobParamList = []
        # if self.taskJobHistoryId!=None:
        #     self.taskJobParamList=CacheFactory.get("task_job_param", self.taskJobHistoryId)
        # if self.taskJobParamList!=None:
        #     for taskJobParam in self.taskJobParamList:
        #         paramMap[taskJobParam.paramNameEn]=taskJobParam.paramValue
        tableName = jobTemplate.tableName
        jobTemplateId = jobTemplate.id
        # Template-level database wins unless unset or "-1".
        databaseId = jobTemplate.databaseId if jobTemplate.databaseId != "-1" and jobTemplate.databaseId != None else self.taskJob.databaseId
        db = self.dbclient.getConnection(databaseId)

        if db == None:
            logging.warning('db is null,please check it with databaseid :%s' %
                            databaseId)
            # With the main spider queue drained, finish the task anyway.
            if llen(
                    ConfigUtils.getRedisPorperties(
                        KEYMAP.MAIN_SPIDER_REDIS_KEY)) == 0:
                if self.taskJob.status != TaskStatus.SUCCESS:
                    TaskJobDao.updateTaskJobStatus(self.taskJob.id,
                                                   TaskStatus.SUCCESS)
                    UrlDao.updateUrlStatusListByTaskJobHistoryId(
                        self.taskJobHistoryId,
                        status=UrlStatus.STOP,
                        desc="no db")
            return
        sqlArray = []
        if data == None or len(data) == 0:
            logging.warning(
                'insert data not exist,please retry crawler or check template or check error'
            )
            # Same drain-and-finish handling as the missing-db case.
            if llen(
                    ConfigUtils.getRedisPorperties(
                        KEYMAP.MAIN_SPIDER_REDIS_KEY)) == 0:
                if self.taskJob.status != TaskStatus.SUCCESS:
                    TaskJobDao.updateTaskJobStatus(self.taskJob.id,
                                                   TaskStatus.SUCCESS)
                    UrlDao.updateUrlStatusListByTaskJobHistoryId(
                        self.taskJobHistoryId,
                        status=UrlStatus.STOP,
                        desc="no data")
            return
        logging.info('----pipelines insert data-----%s' % str(data))
        for d in data:
            # Decorate the row with bookkeeping fields before insert.
            d["task_job_url"] = url
            if self.dataParentId != None:
                d["parent_id"] = self.dataParentId
            d["id"] = str(uuid.uuid1())
            if self.dbclient.db_type == 'kafka':
                d['TemplateName'] = jobTemplate.name
                d['UrlStatus'] = 0
                d['Timestamps'] = int(time.time())
            # hdfs/mongodb inserts need the template id as an extra key.
            if self.dbclient.db_type == 'hdfs' or self.dbclient.db_type == 'mongodb':
                sqlArray.append(
                    db.insert(jobTemplate.id, tableName, d, paramMap))
            else:
                sqlArray.append(db.insert(tableName, d, paramMap))
            if jobTemplateId != None:
                try:
                    # Fan out child templates with the merged row data.
                    # NOTE: dict.items() + dict.items() is Python 2 only.
                    childJobTemplateList = TemplateDao.queryJobTemplateListByParentId(
                        jobTemplateId)
                    self.loadNext(childJobTemplateList,
                                  dict(extraData.items() + d.items()))
                except Exception, e:
                    logging.error(e.message)
Пример #15
0
def startCrawlerByTaskJobId(jobId, taskJobParam=None):
    """Start crawling for a task job.

    Records a history row, marks the job RUNNING, then dispatches on the
    task type: SINGLE prepares the job's one template (table creation
    included) and enqueues it; BATCH does the same for every template
    attached to the job; DEPTH enqueues the job itself. Any exception
    marks the job FAILED and is logged.

    :param jobId: id of the task job to start.
    :param taskJobParam: optional extra crawl param; SINGLE/BATCH replace
        it with keyword-derived params.
    """
    logging.info('------startCrawlerByTaskJobId-------%s' % jobId)
    taskJobHistory = TaskJobDao.addTaskJobHistroy(jobId,
                                                  TaskJobHistoryType.SINGLE)
    taskJob = query(TaskJob, text('id="' + str(jobId) + '"'), type=0)
    TaskJobDao.updateTaskJobStatus(jobId, TaskStatus.RUNNING)
    # tableName = jobTemplate.tableName
    jobTemplateId = None
    dbClient = DbClient()
    # Log the start (content string means "task started").
    LoggerDao.addTaskJobLogger(taskJob,
                               LoggerDao.LoggerType.START,
                               taskJobHistoryId=taskJobHistory.id,
                               status=TaskStatus.RUNNING,
                               content=u"任务启动")
    try:
        if TaskType.SINGLE == str(taskJob.type):
            # SINGLE: one template referenced directly by the task job.
            jobTemplate = TemplateDao.queryJobTemplate(taskJob.jobTemplateId)
            if jobTemplate == None:
                TaskJobDao.updateTaskJobStatus(jobId, TaskStatus.FAIL)
                LoggerDao.addTaskJobLogger(taskJob,
                                           LoggerDao.LoggerType.START,
                                           jobTemplateId=jobTemplateId,
                                           taskJobHistoryId=taskJobHistory.id,
                                           content=u"no jobTemplate")
                logging.error("no jobTemplate")
                return
            jobTemplateId = jobTemplate.id
            taskJobParam = crawlerbyKeyWord(taskJob, jobTemplate.id)
            jobTemplateFieldList = TemplateDao.queryJobTemplateFieldByJobTemplateId(
                jobTemplate.id)
            dbClient.getConnection(taskJob.databaseId
                                   or jobTemplate.databaseId)
            # Ensure the destination table exists before crawling.
            if not dbClient.isTableExist(jobTemplate.tableName):
                dbClient.createTable(jobTemplate.id, jobTemplate.tableName,
                                     jobTemplateFieldList)
            # The task job's own url/table override the template's.
            setattr(jobTemplate, "url", taskJob.url)
            setattr(jobTemplate, "tableName", taskJob.tableName)
            # Content string means "targeted task started".
            LoggerDao.addTaskJobLogger(taskJob,
                                       LoggerDao.LoggerType.START,
                                       jobTemplateId=jobTemplate.id,
                                       taskJobHistoryId=taskJobHistory.id,
                                       status=TaskStatus.RUNNING,
                                       content=u"定向任务任务启动")
            parseUrlAndInsertRedis(taskJob,
                                   taskJobParam=taskJobParam,
                                   taskJobHistory=taskJobHistory,
                                   jobTemplate=jobTemplate)
        elif TaskType.BATCH == str(taskJob.type):
            # BATCH: every template attached to the task job.
            jobTemplateList = TemplateDao.loadTemplateByTaskJobId({
                "taskJobId":
                taskJob.id,
                "action":
                "1"
            })
            if jobTemplateList.get("jobTemplateList") != None and len(
                    jobTemplateList.get("jobTemplateList")) > 0:
                for jobTemplate in jobTemplateList.get("jobTemplateList"):
                    jobTemplateId = jobTemplate.id
                    taskJobParam = crawlerbyKeyWord(taskJob, jobTemplate.id)
                    jobTemplateFieldList = TemplateDao.queryJobTemplateFieldByJobTemplateId(
                        jobTemplate.id)
                    # Content string means "batch task started".
                    LoggerDao.addTaskJobLogger(
                        taskJob,
                        LoggerDao.LoggerType.START,
                        jobTemplateId=jobTemplate.id,
                        taskJobHistoryId=taskJobHistory.id,
                        status=TaskStatus.RUNNING,
                        content=u"批量任务启动")
                    # Template-level database wins unless unset or "-1".
                    databaseId = jobTemplate.databaseId if jobTemplate.databaseId != "-1" and jobTemplate.databaseId != None else taskJob.databaseId
                    dbClient.getConnection(databaseId)
                    if dbClient == None:
                        TaskJobDao.updateTaskJobStatus(jobId, TaskStatus.FAIL)
                        LoggerDao.addTaskJobLogger(
                            taskJob,
                            LoggerDao.LoggerType.START,
                            jobTemplateId=jobTemplateId,
                            taskJobHistoryId=taskJobHistory.id,
                            content=u"no dbClient")
                        logging.error("no dbClient")
                        return
                    if not dbClient.isTableExist(jobTemplate.tableName):
                        dbClient.createTable(jobTemplate.id,
                                             jobTemplate.tableName,
                                             jobTemplateFieldList)
                    parseUrlAndInsertRedis(taskJob,
                                           taskJobParam=taskJobParam,
                                           taskJobHistory=taskJobHistory,
                                           jobTemplate=jobTemplate)
            else:
                TaskJobDao.updateTaskJobStatus(jobId, TaskStatus.FAIL)
                LoggerDao.addTaskJobLogger(taskJob,
                                           LoggerDao.LoggerType.START,
                                           jobTemplateId=jobTemplateId,
                                           taskJobHistoryId=taskJobHistory.id,
                                           content=u"no jobTemplate")
                logging.error("no jobTemplate")
        elif TaskType.DEPTH == str(taskJob.type):
            # DEPTH: no template needed; enqueue the job itself.
            parseUrlAndInsertRedis(taskJob,
                                   taskJobParam=taskJobParam,
                                   taskJobHistory=taskJobHistory)

            # print mainId
            # if tempList:
            #     for temp in tempList:
            #         tempNode = hashConsistency.get_node(stringify(temp))
            #         nodePool.append(tempNode)
            #         RedisUtils.lpush(tempNode+"_"+ConfigUtils.getRedisPorperties(KEYMAP.ASSIST_SPIDER_REDIS_KEY), taskJobHistory.id)
            #         RedisUtils.lpush(tempNode+"_"+ConfigUtils.getRedisPorperties(KEYMAP.ASSIST_SPIDER_REDIS_KEY) + "_" + taskJobHistory.id,stringify(temp))
            #         RedisUtils.hset(ConfigUtils.getRedisPorperties(KEYMAP.FINISH_SPIDER_REDIS_KEY), temp.id, stringify(temp))
    except Exception, e:
        # Any failure marks the whole task FAILED ("解析异常" = parse error).
        TaskJobDao.updateTaskJobStatus(jobId, TaskStatus.FAIL)
        LoggerDao.addTaskJobLogger(taskJob,
                                   LoggerDao.LoggerType.START,
                                   jobTemplateId=jobTemplateId,
                                   taskJobHistoryId=taskJobHistory.id,
                                   content=u"解析异常" + str(e))
        logging.error(repr(Exception))
Пример #16
0
def parseUrlAndInsertRedis(taskJob,
                           paramMap=None,
                           taskJobParam=None,
                           taskJobHistory=None,
                           jobTemplate=None):
    """Render the job's URL(s) and push the resulting crawl work into Redis.

    For DEPTH-type tasks the task itself is queued (after a bloom-filter
    de-duplication check keyed on the task id).  For every other task type
    the URL is rendered against the collected parameter lists and one
    JobTemplate copy per rendered URL is pushed onto the assist-spider
    queues, plus a UrlClazz status row is persisted for each push.

    :param taskJob: task whose URL(s) should be queued
    :param paramMap: extra render parameters (defaults to an empty dict)
    :param taskJobParam: a single param object or a list of them to merge in
    :param taskJobHistory: current history record; its id namespaces the
        Redis keys (dereferenced unconditionally in the DEPTH branch)
    :param jobTemplate: template supplying url/params for non-DEPTH tasks
        (dereferenced unconditionally in the non-DEPTH branch)
    """
    # Fix: original signature used a mutable default (paramMap={}), which
    # is shared across calls; use None and fill in per call instead.
    if paramMap is None:
        paramMap = {}
    if TaskType.DEPTH == str(taskJob.type):
        # Depth crawl: queue the task itself, de-duplicated per task id.
        if bloomfilter_check(taskJob.id, taskJob.url):
            RedisUtils.lpush(
                ConfigUtils.getRedisPorperties(KEYMAP.DEPTH_SPIDER_REDIS_KEY),
                taskJobHistory.id)
            RedisUtils.lpush(
                ConfigUtils.getRedisPorperties(KEYMAP.DEPTH_SPIDER_REDIS_KEY) +
                "_" + taskJobHistory.id, stringify(taskJob))
    else:
        url = taskJob.url
        # Parameter collection order: stored task params, then the
        # caller-supplied param(s), then the template's own params.
        taskJobParamList = TaskJobDao.queryTaskJobParam(taskJob.id)
        if taskJobParam is not None:
            if isinstance(taskJobParam, list):
                taskJobParamList.extend(taskJobParam)
            else:
                taskJobParamList.append(taskJobParam)
        jobTemplateParamList = TemplateDao.queryJobTemplateParamByJobTemplateId(
            jobTemplate.id)
        if jobTemplateParamList is None:
            # Fix: the DAO may return None; the original code then crashed
            # on the append below when taskJobHistory was set.
            jobTemplateParamList = []
        if len(jobTemplateParamList) > 0:
            taskJobParamList.extend(jobTemplateParamList)
        if taskJobHistory is not None:
            jobTemplateParamTaskJob = JobTemplateParam(
                paramNameEn="task_job_id_sequence",
                paramValue=str(taskJobHistory.id))
            # NOTE(review): this param is appended AFTER jobTemplateParamList
            # was already copied into taskJobParamList, so it never reaches
            # the render loop below.  Preserved as-is; confirm whether it was
            # meant to go into taskJobParamList instead.
            jobTemplateParamList.append(jobTemplateParamTaskJob)
        if not taskJobParamList:
            # No parameters: render the single URL once and queue one copy.
            if str(taskJob.type) == TaskType.BATCH:
                url = jobTemplate.url
            renderUrl = RenderUtils.render(url, paramMap)
            newJobTemplate = ClassCopy.copyToNewInstances(
                jobTemplate, JobTemplate)
            taskJobHistoryId = taskJobHistory.id
            # NOTE(review): this branch records jobTemplate.url (not the
            # rendered URL) in the status row, unlike the multi-param branch
            # below which records renderUrl -- confirm which is intended.
            urlListStatus = UrlClazz(url=jobTemplate.url,
                                     parentUrl=paramMap.get("task_job_url"),
                                     jobTemplateId=jobTemplate.id,
                                     jobTemplateParentId=jobTemplate.parentId,
                                     taskJobId=taskJob.id,
                                     taskJobHistoryId=taskJobHistoryId)
            setattr(newJobTemplate, "taskJobId", taskJob.id)
            setattr(newJobTemplate, "taskJobHistoryId", taskJobHistoryId)
            setattr(newJobTemplate, "url", renderUrl)
            setattr(newJobTemplate, "extraData", paramMap)
            setattr(newJobTemplate, "urlListStatusId", urlListStatus.id)
            LoggerDao.addTaskJobLogger(taskJob,
                                       LoggerDao.LoggerType.URL_TO_REDIS,
                                       jobTemplateId=newJobTemplate.id,
                                       taskJobHistoryId=taskJobHistoryId,
                                       content=u"redis_入库",
                                       url=renderUrl,
                                       status=TaskStatus.RUNNING)
            # Queue the history id, then the serialized template under a
            # history-scoped key, then index it in the finish hash.
            RedisUtils.lpush(
                ConfigUtils.getRedisPorperties(KEYMAP.ASSIST_SPIDER_REDIS_KEY),
                taskJobHistoryId)
            RedisUtils.lpush(
                ConfigUtils.getRedisPorperties(KEYMAP.ASSIST_SPIDER_REDIS_KEY)
                + "_" + taskJobHistoryId, stringify(newJobTemplate))
            RedisUtils.hset(
                ConfigUtils.getRedisPorperties(KEYMAP.FINISH_SPIDER_REDIS_KEY),
                newJobTemplate.id, stringify(newJobTemplate))
            saveUrlListStatus(urlListStatus)
        else:
            # One rendered URL per parameter combination.
            for data in paraseJobTemplateList(taskJobParamList, paramMap):
                if str(taskJob.type) == TaskType.BATCH:
                    url = jobTemplate.url
                parentId = paramMap.get("dataParentId")
                # Py2 idiom: merge the per-combination values over paramMap.
                paramMap = dict(paramMap.items() + data.items())
                renderUrl = RenderUtils.render(url, paramMap)
                newJobTemplate = ClassCopy.copyToNewInstances(
                    jobTemplate, JobTemplate)
                taskJobHistoryId = taskJobHistory.id
                urlListStatus = UrlClazz(
                    url=renderUrl,
                    parentUrl=paramMap.get("task_job_url"),
                    jobTemplateId=jobTemplate.id,
                    jobTemplateParentId=jobTemplate.parentId,
                    taskJobId=taskJob.id,
                    taskJobHistoryId=taskJobHistoryId)
                setattr(newJobTemplate, "taskJobId", taskJob.id)
                setattr(newJobTemplate, "taskJobHistoryId", taskJobHistoryId)
                setattr(newJobTemplate, "url", renderUrl)
                setattr(newJobTemplate, "dataParentId", parentId)
                setattr(newJobTemplate, "extraData", paramMap)
                setattr(newJobTemplate, "urlListStatusId", urlListStatus.id)
                LoggerDao.addTaskJobLogger(taskJob,
                                           LoggerDao.LoggerType.URL_TO_REDIS,
                                           jobTemplateId=newJobTemplate.id,
                                           taskJobHistoryId=taskJobHistoryId,
                                           content=u"redis_入库_多参数",
                                           url=renderUrl,
                                           status=TaskStatus.RUNNING)
                RedisUtils.lpush(
                    ConfigUtils.getRedisPorperties(
                        KEYMAP.ASSIST_SPIDER_REDIS_KEY), taskJobHistoryId)
                RedisUtils.lpush(
                    ConfigUtils.getRedisPorperties(
                        KEYMAP.ASSIST_SPIDER_REDIS_KEY) + "_" +
                    taskJobHistoryId, stringify(newJobTemplate))
                RedisUtils.hset(
                    ConfigUtils.getRedisPorperties(
                        KEYMAP.FINISH_SPIDER_REDIS_KEY), newJobTemplate.id,
                    stringify(newJobTemplate))
                saveUrlListStatus(urlListStatus)
Пример #17
0
def startCrawlerByTemplateId(templateId, jobTemplateParamList, taskJobHistory):
    """Start a crawl for the given template by queueing its URL in Redis.

    Looks up the JobTemplate by id and, when found, hands it to
    parseUrlAndInsertRedis as the task object.  Returns silently (None)
    when the template does not exist.

    :param templateId: primary key of the JobTemplate to crawl
    :param jobTemplateParamList: params forwarded as ``taskJobParam``
    :param taskJobHistory: history record forwarded to the queueing call
    """
    jobTemplate = TemplateDao.queryJobTemplate(templateId)
    # Fix: compare against the None singleton with `is`, not `==` (PEP 8).
    if jobTemplate is None:
        return
    # Keyword arguments make the slot assignment explicit: the template is
    # passed as `taskJob` and the params land in `taskJobParam`.
    # NOTE(review): no `jobTemplate=` keyword is supplied, so it stays None
    # inside parseUrlAndInsertRedis, which dereferences jobTemplate.id for
    # non-DEPTH task types -- confirm this path is DEPTH-only.
    parseUrlAndInsertRedis(jobTemplate,
                           paramMap={},
                           taskJobParam=jobTemplateParamList,
                           taskJobHistory=taskJobHistory)