def contentDetail(taskJobId):
    """Assemble the detail view for one task job.

    Gathers the job itself plus everything related to it: its db source,
    sibling tasks (same parent), child tasks, parent task, timer, template,
    template fields (flagged with whether this job selected them) and the
    job's own selected-field relations.

    :param taskJobId: id of the TaskJob to describe
    :return: {"result": TaskJobDetail(...)} aggregate
    """
    taskJob = loadTaskById(taskJobId)
    dbSourse = queryDbSource(taskJob.databaseId)
    # siblings share this job's parent
    brotherList = loadChildByParentId(taskJob.parentId)
    # direct children of this job
    childrenList = loadChildByParentId(taskJob.id)
    parent = loadTaskById(taskJob.parentId)
    timerJob = loadTimerJob(taskJob.taskJobTimerId)
    jobTemplate = TemplateDao.queryJobTemplate(taskJob.jobTemplateId)
    jobTemplateList = TemplateDao.queryJobTemplateFieldByJobTemplateId(
        taskJob.jobTemplateId)
    taskReFieldList = loadAllTaskFieldById(taskJob.id)
    # flag each template field with whether this task selected it
    chosenFieldIds = set()
    for taskJobReField in taskReFieldList:
        chosenFieldIds.add(taskJobReField.jobTemplateFieldId)
    for jobTemplateField in jobTemplateList:
        jobTemplateField.chooseFlag = jobTemplateField.id in chosenFieldIds
    return {
        "result": TaskJobDetail(dbSourse, taskJob, brotherList, parent,
                                timerJob, jobTemplate, jobTemplateList,
                                taskReFieldList, childrenList)
    }
def exportJobTemplate(jobTemplateId):
    """Serialize a job template (with its fields and params) into a
    downloadable attachment named ``template<id>.data``.

    :param jobTemplateId: id of the template to export
    :return: Flask response carrying the serialized template
    """
    jobTemplate = queryJobTemplate(jobTemplateId)
    newJobTemplate = ClassCopy.copyToNewInstances(jobTemplate, JobTemplate)
    # detach params and fields into plain copies before serializing
    paramRows = TemplateDao.queryJobTemplateParamByJobTemplateId(jobTemplateId)
    newParamList = [ClassCopy.copyToNewInstances(row, JobTemplateParam)
                    for row in (paramRows or [])]
    fieldRows = TemplateDao.queryJobTemplateFieldByJobTemplateId(jobTemplateId)
    newFieldList = [ClassCopy.copyToNewInstances(row, JobTemplateField)
                    for row in (fieldRows or [])]
    serializer = SeriesEntity.JobTemplateSerialize(
        newJobTemplate, newFieldList, newParamList)
    response = make_response(serializer.serialize())
    response.headers["Content-Disposition"] = (
        "attachment; filename=template%s.data" % (jobTemplateId))
    return response
def parseUrlAndInsertRedis(jobTemplate, paramMap=None, jobParamList=None, taskJobHistory=None):
    """Render the template's URL for every parameter combination and build
    one JobTemplate copy per rendered URL, tagged with the history id.

    :param jobTemplate: template whose ``url`` is rendered
    :param paramMap: base render context; a fresh dict when omitted
    :param jobParamList: extra params merged with the template's own params
    :param taskJobHistory: optional history record; its id is attached to
        every produced template copy
    """
    # BUGFIX: the signature used a mutable default (paramMap={}), which is
    # shared between calls and can leak keys across invocations.
    if paramMap is None:
        paramMap = {}
    url = jobTemplate.url
    jobTemplateId = jobTemplate.id
    taskJobHistoryId = taskJobHistory.id if taskJobHistory is not None else None
    jobTemplateParamList = TemplateDao.queryJobTemplateParamByJobTemplateId(
        jobTemplateId)
    if jobTemplateParamList is None:
        jobTemplateParamList = []
    if jobParamList is not None:
        jobTemplateParamList.extend(jobParamList)
    if len(jobTemplateParamList) <= 0:
        # no parameters at all: render the single URL with the base context
        renderUrl = RenderUtils.render(url, paramMap)
        newJobTemplate = ClassCopy.copyToNewInstances(jobTemplate, JobTemplate)
        newJobTemplate.url = renderUrl
        setattr(newJobTemplate, "taskJobHistoryId", taskJobHistoryId)
        # RedisUtils.lpush(ConfigUtils.getRedisPorperties(KEYMAP.TEMPLATE_SPIDER_NAME_REDIS_KEY), taskJobHistoryId)
        # RedisUtils.lpush(ConfigUtils.getRedisPorperties(KEYMAP.TEMPLATE_SPIDER_NAME_REDIS_KEY)+"_"+taskJobHistoryId, stringify(newJobTemplate))
    else:
        # one rendered template copy per parameter combination
        for data in paraseJobTemplateList(jobTemplateParamList, paramMap,
                                          jobTemplate):
            renderUrl = RenderUtils.render(
                url, dict(paramMap.items() + data.items()))  # py2 dict merge
            newJobTemplate = ClassCopy.copyToNewInstances(
                jobTemplate, JobTemplate)
            newJobTemplate.url = renderUrl
            setattr(newJobTemplate, "taskJobHistoryId", taskJobHistoryId)
def relationTaskJob(jobTemplateId, taskJobId, fieldIds):
    """Re-bind a task job to a template: replace its field relations and
    point the job at the template.

    :param jobTemplateId: template to bind
    :param taskJobId: job being re-bound
    :param fieldIds: "-1" means "select every field of the template";
        otherwise a comma-separated list of template-field ids
    """
    TaskJobDao.delTaskJobReRelationByTaskJobId(taskJobId)
    if fieldIds is not None and str(fieldIds) == "-1":
        # -1 selects all fields of the template
        jobTemplateFieldList = TemplateDao.queryJobTemplateFieldByJobTemplateId(
            jobTemplateId)
        for field in jobTemplateFieldList:
            _addTaskJobReField(jobTemplateId, taskJobId, field.id)
    else:
        for fieldId in fieldIds.split(","):
            _addTaskJobReField(jobTemplateId, taskJobId, fieldId)
    # BUGFIX: the "-1" branch used to return here, skipping the template-id
    # update and the flush/commit, so its staged relations were never saved.
    # TaskJobDao.updateTaskJob(TaskJob.id==taskJobId,{TaskJob.jobTemplateId:jobTemplateId})
    Session.query(TaskJob).filter(TaskJob.id == taskJobId).update(
        {TaskJob.jobTemplateId: jobTemplateId})
    Session.flush()
    Session.commit()


def _addTaskJobReField(jobTemplateId, taskJobId, jobTemplateFieldId):
    """Create and stage one TaskJobReField relation row (not committed)."""
    taskJobReField = TaskJobReField(id=uuid.uuid1())
    taskJobReField.jobTemplateId = jobTemplateId
    taskJobReField.taskJobId = taskJobId
    taskJobReField.delFlag = False
    taskJobReField.jobTemplateFieldId = jobTemplateFieldId
    taskJobReField.createTime = datetime.now()
    Session.add(taskJobReField)
def copyTask(taskJobId):
    """Duplicate a task job together with its field relations and template
    relations.

    The copy gets a fresh id, a "_副本" (copy) name suffix and status 0
    (not running).

    :param taskJobId: id of the job to duplicate
    :return: {"flag": whether template relations were copied,
              "newTaskId": id of the new task as a string}
    """
    sourceJob = loadTaskById(taskJobId)
    reFieldRows = loadTaskJobReFieldList(taskJobId)
    reTemplateRows = TemplateDao.loadTaskJobReTemplate(taskJobId)

    newTaskId = uuid.uuid1()
    newJob = ClassCopy.copyToNewInstances(sourceJob, TaskJob)
    newJob.id = newTaskId
    newJob.createTime = datetime.now()
    newJob.name = sourceJob.name + "_副本"
    newJob.status = 0  # 0 = not running
    Session.add(newJob)

    # clone the selected-field relations onto the new job
    for reField in reFieldRows:
        fieldCopy = ClassCopy.copyToNewInstances(reField, TaskJobReField)
        fieldCopy.taskJobId = newTaskId
        fieldCopy.createTime = datetime.now()
        fieldCopy.id = uuid.uuid1()
        Session.add(fieldCopy)

    # clone the template relations, when any exist
    copiedTemplates = False
    if reTemplateRows is not None:
        for reTemplate in reTemplateRows:
            templateCopy = ClassCopy.copyToNewInstances(
                reTemplate, TaskJobReTemplate)
            templateCopy.taskJobId = newTaskId
            templateCopy.createTime = datetime.now()
            templateCopy.id = uuid.uuid1()
            Session.add(templateCopy)
            copiedTemplates = True

    return {"flag": copiedTemplates, "newTaskId": str(newTaskId)}
def getScrapBaseItem(self, taskJobId):
    """Build (or fetch from cache) the ScrapBaseItem for a task job.

    On a cache miss the item is assembled from the DAOs (task job,
    template, fields, params) and cached under the history id; the
    task-job-history record is cached separately and attached to the item
    on every call.

    :param taskJobId: id of the task job being scraped
    :return: cached ScrapBaseItem, or a fresh empty one as fallback
    """
    taskJobHistoryId = self.params.get("taskJobHistoryId") or ""
    # BUGFIX: on a cache hit the original never bound scrapBaseItem and
    # raised NameError at the "taskJobHistroy" assignment below; reuse the
    # cached item instead.
    scrapBaseItem = CacheFactory.get("task_job", taskJobHistoryId)
    if scrapBaseItem is None:
        taskJob = querTaskJob(taskJobId)
        scrapBaseItem = ScrapBaseItem()
        jobTemplateFieldList = queryFieldByTaskJobId(taskJobId)
        if jobTemplateFieldList is None or len(jobTemplateFieldList) == 0:
            # fall back to the template's own field list
            jobTemplateFieldList = TemplateDao.queryJobTemplateFieldByJobTemplateId(
                taskJob.jobTemplateId)
        jobTemplate = queryJobTemplate(taskJob.jobTemplateId)
        jobTemplateParamList = queryJobParam(taskJobId)
        taskJobParamList = TaskJobDao.queryTaskJobParam(taskJobId)
        scrapBaseItem["jobTemplateFieldList"] = jobTemplateFieldList
        scrapBaseItem["jobTemplate"] = jobTemplate
        scrapBaseItem["taskJobId"] = taskJobId
        scrapBaseItem["taskJob"] = taskJob
        setattr(taskJob, "taskJobHistoryId", taskJobHistoryId)
        scrapBaseItem["jobTemplateParamList"] = jobTemplateParamList
        scrapBaseItem["taskJobParamList"] = taskJobParamList
        CacheFactory.cache("task_job", taskJobHistoryId, scrapBaseItem)
    # cache the history record once, then always read it back from cache
    if taskJobHistoryId != None and taskJobHistoryId != "" and CacheFactory.get(
            "task_job_history", taskJobHistoryId) == None:
        taskJobHistory = TaskJobDao.loadTaskJobHistoryById(taskJobHistoryId)
        CacheFactory.cache("task_job_history", taskJobHistoryId,
                           taskJobHistory)
    taskJobHistory = CacheFactory.get("task_job_history", taskJobHistoryId)
    # NOTE: "taskJobHistroy" (sic) is the key downstream consumers read;
    # the misspelling must be preserved.
    scrapBaseItem["taskJobHistroy"] = taskJobHistory
    return CacheFactory.get("task_job", taskJobHistoryId) or ScrapBaseItem()
def exportJobTemplateText(jobTemplateId): jobTemplate = queryJobTemplate(jobTemplateId) jobTemplateFieldList = TemplateDao.queryJobTemplateFieldByJobTemplateId( jobTemplateId) jobTemplateSerialize = SeriesEntity.JobTemplateSerialize( jobTemplate, jobTemplateFieldList) print jobTemplateSerialize.serialize() print SeriesEntity.JobTemplateSerialize.deSerialize( jobTemplateSerialize.serialize()) #.jobTemplate.id return
def importJobTemplate(jobTemplateId):
    """Return a job template as a downloadable ``template<id>.data``
    attachment.

    NOTE(review): despite its name this function exports (it mirrors
    exportJobTemplate) — confirm whether it was meant to do something else.

    :param jobTemplateId: id of the template to serialize
    :return: Flask response carrying the serialized template
    """
    jobTemplate = queryJobTemplate(jobTemplateId)
    jobTemplateFieldList = TemplateDao.queryJobTemplateFieldByJobTemplateId(
        jobTemplateId)
    jobTemplateSerialize = SeriesEntity.JobTemplateSerialize(
        jobTemplate, jobTemplateFieldList)
    # BUGFIX: make_response was given the serializer OBJECT; pass the
    # serialized text instead, as exportJobTemplate does.
    response = make_response(jobTemplateSerialize.serialize())
    response.headers[
        "Content-Disposition"] = "attachment; filename=template%s.data" % (
            jobTemplateId)
    return response
def qrcode():
    """Screenshot the ad landing page referenced by a template field value
    and return it as a PNG; error response when the capture failed.

    Reads job_template_id / field_id / field_name from the request params,
    resolves the target URL from the template's table, then screenshots it.
    """
    params = loadParams()
    template_id = params.get('job_template_id')
    field_id = params.get('field_id')
    field_name = params.get('field_name')
    template = TemplateDao.queryJobTemplate(template_id)
    # resolve the ad's target URL out of the template's data table
    target_url = get_result(template.tableName, field_name, field_id)
    screenshot_path = AdUtil.take_screenshot(target_url)
    if not screenshot_path:
        return parseResponse(-1)
    return send_file(screenshot_path, mimetype='image/png')
def createTableByTaskJobId(self, jobid, tableName=None, jobTemplateFieldList=None):
    """Create the MongoDB collection for a task job and seed it with one
    placeholder document.

    :param jobid: task job id; used to resolve the table name and fields
    :param tableName: explicit collection name; resolved from the task job
        when None
    :param jobTemplateFieldList: template fields; queried by job id when
        missing
    :return: None (returns early when no fields can be found)
    """
    if tableName == None:
        taskJob = TaskJobDao.loadTaskById(jobid)
        tableName = taskJob.tableName
    # if self.isTableExist(tableName):
    #     logging.info('isTableExist:%s' % ('TRUE'))
    #     return
    if jobTemplateFieldList == None or len(jobTemplateFieldList) == 0:
        jobTemplateFieldList = TemplateDao.queryJobTemplateFieldByJobTemplateId(
            jobid)  # (jobid)
    if (jobTemplateFieldList == None or len(jobTemplateFieldList) == 0):
        return
    fieldList = []
    for jobTemplateField in jobTemplateFieldList:
        dataLength = jobTemplateField.dataLength
        dataType = jobTemplateField.dataType or "varchar"
        fieldNameEn = jobTemplateField.fieldNameEn
        if dataType == 'int':
            fieldList.append("`%s` %s" % (fieldNameEn, dataType))
        elif dataLength != None and dataLength > 0 or (
                dataLength == None and dataType == "varchar"):
            # NOTE(review): dataType != 'int' is always true in this elif,
            # so any supplied dataLength is overwritten with "1024" — looks
            # like the intended check was dataLength == None; confirm.
            if dataType != 'int':
                dataLength = "1024"
            fieldList.append("`%s` %s(%s)" %
                             (fieldNameEn, dataType, dataLength))
        else:
            fieldList.append("`%s` %s" % (fieldNameEn, dataType))
    # bookkeeping "columns"; several entries keep a trailing space exactly
    # as written — these strings become literal document keys below
    fieldList.append("id ")
    fieldList.append("task_job_create_time")
    fieldList.append("task_job_del_flag ")
    fieldList.append("task_job_id_sequence")
    fieldList.append("parent_id ")
    fieldList.append("task_job_url ")
    fielddic = {}
    collection = self.db[tableName]
    # seed one placeholder document so the collection materializes; note
    # the SQL-style column-definition strings themselves are used as keys
    for index, item in enumerate(fieldList):
        if item == 'task_job_create_time':
            fielddic[item] = time.strftime('%Y-%m-%d %H:%M:%S')
        else:
            fielddic[item] = ''
    collection.insert(fielddic)
def createTableByTaskJobId(self, taskJobId, tableName=None, jobTemplateFieldList=None):
    """Create the relational table for a task job (no-op when it exists).

    Columns come from the job-template fields plus fixed bookkeeping
    columns (id, create time, del flag, history sequence, parent id, url).

    :param taskJobId: task job id; used to resolve the table name/fields
    :param tableName: explicit table name; resolved from the job when None
    :param jobTemplateFieldList: template fields; queried when missing
    :return: None (returns early when the table exists or no fields found)
    """
    if tableName is None:
        taskJob = TaskJobDao.loadTaskById(taskJobId)
        tableName = taskJob.tableName
    if self.isTableExist(tableName):
        logging.info('isTableExist:%s' % ('TRUE'))
        return
    if jobTemplateFieldList is None or len(jobTemplateFieldList) == 0:
        jobTemplateFieldList = TemplateDao.queryJobTemplateFieldByJobTemplateId(
            taskJobId)
    if jobTemplateFieldList is None or len(jobTemplateFieldList) == 0:
        return
    fieldList = []
    for jobTemplateField in jobTemplateFieldList:
        dataLength = jobTemplateField.dataLength
        # BUGFIX: str(x) or "varchar" never falls back (str(None) is the
        # truthy "None"); apply the default before stringifying.
        dataType = str(jobTemplateField.dataType) if jobTemplateField.dataType else "varchar"
        fieldNameEn = str(jobTemplateField.fieldNameEn)
        if dataType == 'int':
            fieldList.append("%s %s" % (fieldNameEn, dataType))
        elif (dataLength is not None and dataLength > 0) or (
                dataLength is None and dataType == "varchar"):
            # BUGFIX: the old code overwrote every supplied length with
            # 1024 (its guard was always true here); only default when the
            # template gave no length.
            if dataLength is None:
                dataLength = "1024"
            fieldList.append("%s %s(%s)" % (fieldNameEn, dataType, dataLength))
        else:
            fieldList.append("%s %s" % (fieldNameEn, dataType))
    # fixed bookkeeping columns shared by every crawl table
    fieldList.append("id varchar(50) primary key")
    fieldList.append("task_job_create_time datetime")
    fieldList.append("task_job_del_flag int")
    fieldList.append("task_job_id_sequence varchar(50)")
    fieldList.append("parent_id varchar(50)")
    fieldList.append("task_job_url varchar(255)")
    create_table_sql = "create table %s(%s)" % (tableName,
                                                ",".join(fieldList))
    self.execute(create_table_sql)
def createTableByTaskJobId(self, jobid, tableName=None, jobTemplateFieldList=None):
    """Create the MySQL table for a task job, plus an index on
    (parent_id, task_job_id_sequence) used by the paging queries.

    :param jobid: task job id; used to resolve the table name and fields
    :param tableName: explicit table name; resolved from the job when None
    :param jobTemplateFieldList: template fields; queried when missing
    :return: None (returns early when no fields can be found)
    """
    if tableName is None:
        taskJob = TaskJobDao.loadTaskById(jobid)
        tableName = taskJob.tableName
    # if self.isTableExist(tableName):
    #     logging.info('isTableExist:%s' % ('TRUE'))
    #     return
    if jobTemplateFieldList is None or len(jobTemplateFieldList) == 0:
        jobTemplateFieldList = TemplateDao.queryJobTemplateFieldByJobTemplateId(
            jobid)
    if jobTemplateFieldList is None or len(jobTemplateFieldList) == 0:
        return
    fieldList = []
    for jobTemplateField in jobTemplateFieldList:
        dataLength = jobTemplateField.dataLength
        dataType = jobTemplateField.dataType or "varchar"
        fieldNameEn = jobTemplateField.fieldNameEn
        if dataType == 'int':
            fieldList.append("`%s` %s" % (fieldNameEn, dataType))
        elif (dataLength is not None and dataLength > 0) or (
                dataLength is None and dataType == "varchar"):
            # BUGFIX: the old guard (dataType != 'int') was always true in
            # this elif, clobbering every supplied length with 1024; only
            # default when the template gave no length.
            if dataLength is None:
                dataLength = "1024"
            fieldList.append("`%s` %s(%s)" %
                             (fieldNameEn, dataType, dataLength))
        else:
            fieldList.append("`%s` %s" % (fieldNameEn, dataType))
    # fixed bookkeeping columns shared by every crawl table
    fieldList.append("id varchar(50) primary key")
    fieldList.append("task_job_create_time datetime")
    fieldList.append("task_job_del_flag tinyint")
    fieldList.append("task_job_id_sequence varchar(50)")
    fieldList.append("parent_id varchar(50)")
    fieldList.append("task_job_url varchar(1024)")
    create_table_sql = "create table %s(%s)" % (tableName,
                                                ",".join(fieldList))
    self.cursor.execute(create_table_sql)
    # index on the columns the paging queries filter by
    self.cursor.execute(
        "alter table `%s` add index index_name(`parent_id`,`task_job_id_sequence`)"
        % (tableName))
# select * from %s where if('%s'!='' and '%s'!='-1',parent_id='%s',1=1)and if('%s'!='',task_job_id_sequence='%s',1=1) limit %s, %s # """ % (tableName, parentId,parentId,parentId,taskHistoryId,taskHistoryId, st, pageCount) # return self.execQuery(sql) if __name__ == '__main__': # 创建表 # SQLServerUtils().createTableByTaskJobId('1') cur_database_param = { "host": "10.128.100.203", "dbname": "demo", "username": "******", "password": "******" } sqlserve = sqlserver(cur_database_param) jobTemplateFieldList = TemplateDao.queryJobTemplateFieldByJobTemplateId( 'b78f69bc-4c1a-11e7-ba08-000c299438c6') sqlserve.createTableByTaskJobId('lsl0s', 'kkkkkk00', jobTemplateFieldList) # print sqlserver import random number = random.randint(10, 100) d = { "pagecount": "116", "id": str(number), "task_job_id_sequence": 'e57f7640-6d1d-11e7-bc2b-38c986148389' } sql = sqlserve.insert('kkkkkk', d) sqlserve.execute(sql) # 插入数据 # taskJob = TaskJob() # taskJob.tableName='taskJob_20170414000000'
def _do_upinsert(self, item):
    """Persist one crawled item into the job's configured datastore and
    queue follow-up (child-template) crawls for each inserted row.

    :param item: pipeline dict carrying data rows, url, template, task job
        and history (under the misspelled key "taskJobHistroy")

    Side effects: when the main spider queue is empty and there is no db
    connection or no data, the task job is marked SUCCESS and its URL
    statuses are stopped.
    """
    now = str(datetime.now())  # NOTE(review): unused
    data = item["data"]
    url = item["url"]
    jobTemplateFieldList = item["jobTemplateFieldList"]
    jobTemplate = item["jobTemplate"]
    self.dataParentId = jobTemplate.dataParentId if hasattr(
        jobTemplate, "dataParentId") else None
    extraData = jobTemplate.extraData
    self.taskJob = item["taskJob"]
    # searchTaskJob = item["searchTaskJob"]
    taskJobHistroy = item["taskJobHistroy"]
    self.taskJobHistoryId = jobTemplate.taskJobHistoryId
    taskJobHistroyId = str(taskJobHistroy.id)
    paramMap = {}
    self.taskJobParamList = []
    if taskJobHistroy != None:
        # tag every row with the crawl-history sequence id
        self.taskJobParamList.append(
            TaskJobParam(paramNameEn="task_job_id_sequence",
                         paramValue=taskJobHistroyId))
        paramMap["task_job_id_sequence"] = taskJobHistroyId
    # if searchTaskJob!=None:
    #     self.taskJobParamList.append(TaskJobParam(paramNameEn=searchTaskJob.name, paramValue=searchTaskJob.name))
    #     paramMap[searchTaskJob.name] = searchTaskJob.name
    # self.taskJobParamList = []
    # if self.taskJobHistoryId!=None:
    #     self.taskJobParamList=CacheFactory.get("task_job_param", self.taskJobHistoryId)
    # if self.taskJobParamList!=None:
    #     for taskJobParam in self.taskJobParamList:
    #         paramMap[taskJobParam.paramNameEn]=taskJobParam.paramValue
    tableName = jobTemplate.tableName
    jobTemplateId = jobTemplate.id
    # template's own database wins unless unset/-1, then the task job's
    databaseId = jobTemplate.databaseId if jobTemplate.databaseId != "-1" and jobTemplate.databaseId != None else self.taskJob.databaseId
    db = self.dbclient.getConnection(databaseId)
    if db == None:
        logging.warning('db is null,please check it with databaseid :%s' %
                        databaseId)
        # main queue drained: close out the task and stop its URLs
        if llen(
                ConfigUtils.getRedisPorperties(
                    KEYMAP.MAIN_SPIDER_REDIS_KEY)) == 0:
            if self.taskJob.status != TaskStatus.SUCCESS:
                TaskJobDao.updateTaskJobStatus(self.taskJob.id,
                                               TaskStatus.SUCCESS)
            UrlDao.updateUrlStatusListByTaskJobHistoryId(
                self.taskJobHistoryId, status=UrlStatus.STOP, desc="no db")
        return
    sqlArray = []
    if data == None or len(data) == 0:
        logging.warning(
            'insert data not exist,please retry crawler or check template or check error'
        )
        if llen(
                ConfigUtils.getRedisPorperties(
                    KEYMAP.MAIN_SPIDER_REDIS_KEY)) == 0:
            if self.taskJob.status != TaskStatus.SUCCESS:
                TaskJobDao.updateTaskJobStatus(self.taskJob.id,
                                               TaskStatus.SUCCESS)
            UrlDao.updateUrlStatusListByTaskJobHistoryId(
                self.taskJobHistoryId, status=UrlStatus.STOP, desc="no data")
        return
    logging.info('----pipelines insert data-----%s' % str(data))
    for d in data:
        # enrich each row with provenance columns before insert
        d["task_job_url"] = url
        if self.dataParentId != None:
            d["parent_id"] = self.dataParentId
        d["id"] = str(uuid.uuid1())
        if self.dbclient.db_type == 'kafka':
            d['TemplateName'] = jobTemplate.name
            d['UrlStatus'] = 0
            d['Timestamps'] = int(time.time())
        # hdfs/mongodb insert signature also takes the template id
        if self.dbclient.db_type == 'hdfs' or self.dbclient.db_type == 'mongodb':
            sqlArray.append(
                db.insert(jobTemplate.id, tableName, d, paramMap))
        else:
            sqlArray.append(db.insert(tableName, d, paramMap))
        if jobTemplateId != None:
            # fan out to child templates with the merged row context
            try:
                childJobTemplateList = TemplateDao.queryJobTemplateListByParentId(
                    jobTemplateId)
                self.loadNext(childJobTemplateList,
                              dict(extraData.items() + d.items()))
            except Exception, e:
                logging.error(e.message)
def startCrawlerByTaskJobId(jobId, taskJobParam=None):
    """Start a crawl for a task job, dispatching on its type.

    SINGLE: one template (the job's own url/table override the
    template's); BATCH: every template attached to the job; DEPTH: push
    the job itself. In each case the target table is created when missing
    and rendered URLs are pushed to redis via parseUrlAndInsertRedis.

    :param jobId: id of the TaskJob to start
    :param taskJobParam: optional extra crawl parameter(s); overwritten by
        crawlerbyKeyWord for SINGLE/BATCH jobs
    Side effects: creates a history row, updates job status, writes logs.
    """
    logging.info('------startCrawlerByTaskJobId-------%s' % jobId)
    taskJobHistory = TaskJobDao.addTaskJobHistroy(jobId,
                                                  TaskJobHistoryType.SINGLE)
    # NOTE(review): jobId is interpolated straight into SQL text — confirm
    # it is always a trusted id upstream.
    taskJob = query(TaskJob, text('id="' + str(jobId) + '"'), type=0)
    TaskJobDao.updateTaskJobStatus(jobId, TaskStatus.RUNNING)
    # tableName = jobTemplate.tableName
    jobTemplateId = None
    dbClient = DbClient()
    LoggerDao.addTaskJobLogger(taskJob,
                               LoggerDao.LoggerType.START,
                               taskJobHistoryId=taskJobHistory.id,
                               status=TaskStatus.RUNNING,
                               content=u"任务启动")
    try:
        if TaskType.SINGLE == str(taskJob.type):
            # directed task: exactly one template
            jobTemplate = TemplateDao.queryJobTemplate(taskJob.jobTemplateId)
            if jobTemplate == None:
                TaskJobDao.updateTaskJobStatus(jobId, TaskStatus.FAIL)
                LoggerDao.addTaskJobLogger(taskJob,
                                           LoggerDao.LoggerType.START,
                                           jobTemplateId=jobTemplateId,
                                           taskJobHistoryId=taskJobHistory.id,
                                           content=u"no jobTemplate")
                logging.error("no jobTemplate")
                return
            jobTemplateId = jobTemplate.id
            taskJobParam = crawlerbyKeyWord(taskJob, jobTemplate.id)
            jobTemplateFieldList = TemplateDao.queryJobTemplateFieldByJobTemplateId(
                jobTemplate.id)
            dbClient.getConnection(taskJob.databaseId or jobTemplate.databaseId)
            if not dbClient.isTableExist(jobTemplate.tableName):
                dbClient.createTable(jobTemplate.id, jobTemplate.tableName,
                                     jobTemplateFieldList)
            # the task's own url/table take precedence over the template's
            setattr(jobTemplate, "url", taskJob.url)
            setattr(jobTemplate, "tableName", taskJob.tableName)
            LoggerDao.addTaskJobLogger(taskJob,
                                       LoggerDao.LoggerType.START,
                                       jobTemplateId=jobTemplate.id,
                                       taskJobHistoryId=taskJobHistory.id,
                                       status=TaskStatus.RUNNING,
                                       content=u"定向任务任务启动")
            parseUrlAndInsertRedis(taskJob,
                                   taskJobParam=taskJobParam,
                                   taskJobHistory=taskJobHistory,
                                   jobTemplate=jobTemplate)
        elif TaskType.BATCH == str(taskJob.type):
            # batch task: run every template attached to the job
            jobTemplateList = TemplateDao.loadTemplateByTaskJobId({
                "taskJobId": taskJob.id,
                "action": "1"
            })
            if jobTemplateList.get("jobTemplateList") != None and len(
                    jobTemplateList.get("jobTemplateList")) > 0:
                for jobTemplate in jobTemplateList.get("jobTemplateList"):
                    jobTemplateId = jobTemplate.id
                    taskJobParam = crawlerbyKeyWord(taskJob, jobTemplate.id)
                    jobTemplateFieldList = TemplateDao.queryJobTemplateFieldByJobTemplateId(
                        jobTemplate.id)
                    LoggerDao.addTaskJobLogger(
                        taskJob,
                        LoggerDao.LoggerType.START,
                        jobTemplateId=jobTemplate.id,
                        taskJobHistoryId=taskJobHistory.id,
                        status=TaskStatus.RUNNING,
                        content=u"批量任务启动")
                    # template database wins unless unset/-1
                    databaseId = jobTemplate.databaseId if jobTemplate.databaseId != "-1" and jobTemplate.databaseId != None else taskJob.databaseId
                    dbClient.getConnection(databaseId)
                    if dbClient == None:
                        TaskJobDao.updateTaskJobStatus(jobId, TaskStatus.FAIL)
                        LoggerDao.addTaskJobLogger(
                            taskJob,
                            LoggerDao.LoggerType.START,
                            jobTemplateId=jobTemplateId,
                            taskJobHistoryId=taskJobHistory.id,
                            content=u"no dbClient")
                        logging.error("no dbClient")
                        return
                    if not dbClient.isTableExist(jobTemplate.tableName):
                        dbClient.createTable(jobTemplate.id,
                                             jobTemplate.tableName,
                                             jobTemplateFieldList)
                    parseUrlAndInsertRedis(taskJob,
                                           taskJobParam=taskJobParam,
                                           taskJobHistory=taskJobHistory,
                                           jobTemplate=jobTemplate)
            else:
                TaskJobDao.updateTaskJobStatus(jobId, TaskStatus.FAIL)
                LoggerDao.addTaskJobLogger(taskJob,
                                           LoggerDao.LoggerType.START,
                                           jobTemplateId=jobTemplateId,
                                           taskJobHistoryId=taskJobHistory.id,
                                           content=u"no jobTemplate")
                logging.error("no jobTemplate")
        elif TaskType.DEPTH == str(taskJob.type):
            # depth crawl: no template, the task itself carries the url
            parseUrlAndInsertRedis(taskJob,
                                   taskJobParam=taskJobParam,
                                   taskJobHistory=taskJobHistory)
        # print mainId
        # if tempList:
        #     for temp in tempList:
        #         tempNode = hashConsistency.get_node(stringify(temp))
        #         nodePool.append(tempNode)
        #         RedisUtils.lpush(tempNode+"_"+ConfigUtils.getRedisPorperties(KEYMAP.ASSIST_SPIDER_REDIS_KEY), taskJobHistory.id)
        #         RedisUtils.lpush(tempNode+"_"+ConfigUtils.getRedisPorperties(KEYMAP.ASSIST_SPIDER_REDIS_KEY) + "_" + taskJobHistory.id,stringify(temp))
        #         RedisUtils.hset(ConfigUtils.getRedisPorperties(KEYMAP.FINISH_SPIDER_REDIS_KEY), temp.id, stringify(temp))
    except Exception, e:
        TaskJobDao.updateTaskJobStatus(jobId, TaskStatus.FAIL)
        LoggerDao.addTaskJobLogger(taskJob,
                                   LoggerDao.LoggerType.START,
                                   jobTemplateId=jobTemplateId,
                                   taskJobHistoryId=taskJobHistory.id,
                                   content=u"解析异常" + str(e))
        # NOTE(review): logs repr(Exception) (the class), not the caught e
        logging.error(repr(Exception))
def parseUrlAndInsertRedis(taskJob,
                           paramMap={},
                           taskJobParam=None,
                           taskJobHistory=None,
                           jobTemplate=None):
    """Render the crawl URL(s) for a task job and push them onto the
    spider redis queues, recording a UrlClazz status row per pushed URL.

    DEPTH jobs: push the task itself onto the depth queue (after a bloom
    filter check). Other jobs: merge task params, template params and the
    history sequence param, then render the URL once (no params) or once
    per parameter combination, pushing each JobTemplate copy onto the
    assist queue and the finish hash.

    NOTE(review): paramMap is a mutable default argument ({}); it is also
    rebound inside the loop below — confirm no cross-call leakage.
    """
    if TaskType.DEPTH == str(taskJob.type):
        # depth crawl: push the whole task once, deduped by bloom filter
        if bloomfilter_check(taskJob.id, taskJob.url):
            RedisUtils.lpush(
                ConfigUtils.getRedisPorperties(KEYMAP.DEPTH_SPIDER_REDIS_KEY),
                taskJobHistory.id)
            RedisUtils.lpush(
                ConfigUtils.getRedisPorperties(KEYMAP.DEPTH_SPIDER_REDIS_KEY)
                + "_" + taskJobHistory.id, stringify(taskJob))
    else:
        url = taskJob.url
        # merge: the job's stored params + explicit extra param(s)
        taskJobParamList = TaskJobDao.queryTaskJobParam(taskJob.id)
        if taskJobParam != None:
            if isinstance(taskJobParam, list):
                taskJobParamList.extend(taskJobParam)
            else:
                taskJobParamList.append(taskJobParam)
        jobTemplateParamList = TemplateDao.queryJobTemplateParamByJobTemplateId(
            jobTemplate.id)
        if jobTemplateParamList != None and len(jobTemplateParamList) > 0:
            taskJobParamList.extend(jobTemplateParamList)
        if taskJobHistory != None:
            # NOTE(review): this history param is appended to
            # jobTemplateParamList AFTER that list was merged into
            # taskJobParamList, so it never reaches the render loop below —
            # confirm whether it should be appended to taskJobParamList.
            jobTemplateParamTaskJob = JobTemplateParam(
                paramNameEn="task_job_id_sequence",
                paramValue=str(taskJobHistory.id))
            jobTemplateParamList.append(jobTemplateParamTaskJob)
        if taskJobParamList == None or len(taskJobParamList) <= 0:
            # no parameters: render the single URL with the base context
            if str(taskJob.type) == TaskType.BATCH:
                url = jobTemplate.url
            renderUrl = RenderUtils.render(url, paramMap)
            # if bloomfilter_check(taskJob.id, renderUrl):
            newJobTemplate = ClassCopy.copyToNewInstances(
                jobTemplate, JobTemplate)
            taskJobHistoryId = taskJobHistory.id
            urlListStatus = UrlClazz(url=jobTemplate.url,
                                     parentUrl=paramMap.get("task_job_url"),
                                     jobTemplateId=jobTemplate.id,
                                     jobTemplateParentId=jobTemplate.parentId,
                                     taskJobId=taskJob.id,
                                     taskJobHistoryId=taskJobHistoryId)
            # try:
            #     request = urllib2.Request(
            #         url=url,
            #         headers={
            #             'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
            #     )
            #     response = urllib2.urlopen(request)
            #     urldate = response.headers['date']
            # except Exception:
            #     pass
            #     print Exception
            setattr(newJobTemplate, "taskJobId", taskJob.id)
            setattr(newJobTemplate, "taskJobHistoryId", taskJobHistoryId)
            setattr(newJobTemplate, "url", renderUrl)
            setattr(newJobTemplate, "extraData", paramMap)
            # setattr(newJobTemplate, "urldate", urldate)
            setattr(newJobTemplate, "urlListStatusId", urlListStatus.id)
            LoggerDao.addTaskJobLogger(taskJob,
                                       LoggerDao.LoggerType.URL_TO_REDIS,
                                       jobTemplateId=newJobTemplate.id,
                                       taskJobHistoryId=taskJobHistoryId,
                                       content=u"redis_入库",
                                       url=renderUrl,
                                       status=TaskStatus.RUNNING)
            # if (hashswitch):
            #     tempList.append(stringify(newJobTemplate))
            # else:
            #     mainId.append(stringify(newJobTemplate))
            RedisUtils.lpush(
                ConfigUtils.getRedisPorperties(KEYMAP.ASSIST_SPIDER_REDIS_KEY),
                taskJobHistoryId)
            RedisUtils.lpush(
                ConfigUtils.getRedisPorperties(KEYMAP.ASSIST_SPIDER_REDIS_KEY)
                + "_" + taskJobHistoryId, stringify(newJobTemplate))
            RedisUtils.hset(
                ConfigUtils.getRedisPorperties(KEYMAP.FINISH_SPIDER_REDIS_KEY),
                newJobTemplate.id, stringify(newJobTemplate))
            saveUrlListStatus(urlListStatus)
        else:
            # one rendered URL per parameter combination
            for data in paraseJobTemplateList(taskJobParamList, paramMap):
                if str(taskJob.type) == TaskType.BATCH:
                    url = jobTemplate.url
                parentId = paramMap.get("dataParentId")
                paramMap = dict(paramMap.items() + data.items())  # py2 merge
                renderUrl = RenderUtils.render(url, paramMap)
                # if bloomfilter_check(taskJob.id, renderUrl):
                newJobTemplate = ClassCopy.copyToNewInstances(
                    jobTemplate, JobTemplate)
                taskJobHistoryId = taskJobHistory.id
                urlListStatus = UrlClazz(
                    url=renderUrl,
                    parentUrl=paramMap.get("task_job_url"),
                    jobTemplateId=jobTemplate.id,
                    jobTemplateParentId=jobTemplate.parentId,
                    taskJobId=taskJob.id,
                    taskJobHistoryId=taskJobHistoryId)
                # try:
                #     request = urllib2.Request(
                #         url=url,
                #         headers={
                #             'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'
                #         }
                #     )
                #     response = urllib2.urlopen(request)
                #     urldate = response.headers['date']
                # except Exception:
                #     pass
                #     print Exception
                setattr(newJobTemplate, "taskJobId", taskJob.id)
                setattr(newJobTemplate, "taskJobHistoryId", taskJobHistoryId)
                setattr(newJobTemplate, "url", renderUrl)
                setattr(newJobTemplate, "dataParentId", parentId)
                setattr(newJobTemplate, "extraData", paramMap)
                # setattr(newJobTemplate, "urldate", urldate)
                setattr(newJobTemplate, "urlListStatusId", urlListStatus.id)
                LoggerDao.addTaskJobLogger(taskJob,
                                           LoggerDao.LoggerType.URL_TO_REDIS,
                                           jobTemplateId=newJobTemplate.id,
                                           taskJobHistoryId=taskJobHistoryId,
                                           content=u"redis_入库_多参数",
                                           url=renderUrl,
                                           status=TaskStatus.RUNNING)
                # if (hashswitch):
                #     tempList.append(newJobTemplate)
                # else:
                RedisUtils.lpush(
                    ConfigUtils.getRedisPorperties(
                        KEYMAP.ASSIST_SPIDER_REDIS_KEY), taskJobHistoryId)
                RedisUtils.lpush(
                    ConfigUtils.getRedisPorperties(
                        KEYMAP.ASSIST_SPIDER_REDIS_KEY) + "_" +
                    taskJobHistoryId, stringify(newJobTemplate))
                # mainId.append(stringify(newJobTemplate))
                RedisUtils.hset(
                    ConfigUtils.getRedisPorperties(
                        KEYMAP.FINISH_SPIDER_REDIS_KEY), newJobTemplate.id,
                    stringify(newJobTemplate))
                saveUrlListStatus(urlListStatus)
def startCrawlerByTemplateId(templateId, jobTemplateParamList, taskJobHistory):
    """Kick off a crawl for a single template: look it up and, when it
    exists, render its URLs into redis via parseUrlAndInsertRedis.

    :param templateId: id of the JobTemplate to crawl
    :param jobTemplateParamList: extra params merged into the template's own
    :param taskJobHistory: history record the crawl is attributed to
    """
    template = TemplateDao.queryJobTemplate(templateId)
    if template is None:
        # unknown template: nothing to start
        return
    parseUrlAndInsertRedis(template, {}, jobTemplateParamList, taskJobHistory)