def taskJobReTemplateDetail(taskJobId): # sql=text("(select job_template_id,field_ids from task_job_re_template where task_job_id='%s' and del_flag=0) A"%(taskJobId)) taskJobReTemplateList=Session.query(TaskJobReTemplate).\ filter(TaskJobReTemplate.taskJobId==taskJobId,TaskJobReTemplate.delFlag==False).all() job_template_idSql=text("(select job_template_id from task_job_re_template where task_job_id='%s' and del_flag=0)"%(taskJobId)) jobTemplateList=Session.query(JobTemplate).filter(JobTemplate.delFlag==False,JobTemplate.id.in_(job_template_idSql)).all() fieldMap={} for taskJobReTemplate in taskJobReTemplateList: fieldMap[taskJobReTemplate.jobTemplateId]=taskJobReTemplate.fieldIds resultList=[] for jobTemplate in jobTemplateList: dataMap={} jobTemplateFieldList=Session.query(JobTemplateField).\ filter(JobTemplateField.delFlag==False,JobTemplateField.jobTemplateId==jobTemplate.id).all() dataMap["jobTemplate"]=jobTemplate fields=fieldMap[jobTemplate.id] for jobTemplateField in jobTemplateFieldList: fieldArray = fields.split(",") if jobTemplateField.id in fieldArray or fields=="-1": setattr(jobTemplateField,"chooseFlag",True) else: setattr(jobTemplateField, "chooseFlag", True) dataMap["jobTemplateFieldList"]=jobTemplateFieldList resultList.append(dataMap) return resultList
def checkFinishJob(): keys=RedisUtils.hkeys(ConfigUtils.getRedisPorperties(KEYMAP.FINISH_SPIDER_REDIS_KEY)) for key in keys : temp=RedisUtils.hget(ConfigUtils.getRedisPorperties(KEYMAP.FINISH_SPIDER_REDIS_KEY), key) newJobTemplate=json.loads(temp) url=newJobTemplate['url'] try: request = urllib2.Request( url=url, headers=(random.choice(user_agent_list)) ) response = urllib2.urlopen(request) urldate = response.headers['date'] tempDate= newJobTemplate['urldate'] print urldate print tempDate if urldate == tempDate: pass else: newJobTemplate['urldate']=urldate taskJobHistoryId = newJobTemplate['taskJobHistoryId'] taskJobHistory=Session.query(TaskJobHistory).filter(TaskJobHistory.id==taskJobHistoryId,TaskJobHistory.delFlag==False).order_by(" create_time desc").first() taskJob=Session.query(TaskJob).filter(TaskJob.id==taskJobHistory.taskJobId).first() LoggerDao.addTaskJobLogger(taskJob,LoggerDao.LoggerType.URL_TO_REDIS, jobTemplateId=newJobTemplate['id'],taskJobHistoryId=taskJobHistoryId, content=u"redis_入库",url=url,status=TaskStatus.RUNNING) RedisUtils.lpush(ConfigUtils.getRedisPorperties(KEYMAP.ASSIST_SPIDER_REDIS_KEY), taskJobHistoryId) RedisUtils.lpush(ConfigUtils.getRedisPorperties(KEYMAP.ASSIST_SPIDER_REDIS_KEY) + "_" + taskJobHistoryId,stringify(newJobTemplate)) RedisUtils.hset(ConfigUtils.getRedisPorperties(KEYMAP.FINISH_SPIDER_REDIS_KEY), newJobTemplate['id'],stringify(newJobTemplate)) except Exception,e: pass print e
def parseTemplate(id, paramMap={}, flag=True): jobTemplate = Session.query(JobTemplate).filter( JobTemplate.id == id, JobTemplate.delFlag == False).first() url = jobTemplate.url jobTemplateParamList = Session.query(JobTemplateParam).filter( JobTemplateParam.jobTemplateId == id, JobTemplateParam.delFlag == False).all() jobTemplateFieldList = Session.query(JobTemplateField).filter( JobTemplateField.jobTemplateId == id, JobTemplateField.delFlag == False).all() paramList = None renderUrl = None if (flag): paramList = CrawlerService.paraseJobTemplateList(jobTemplateParamList or [], paramMap=paramMap, loopFlag=True) renderUrl = RenderUtils.render(url, paramList and paramList[0]) return { "jobTemplate": jobTemplate, "renderUrl": renderUrl, "jobTemplateFieldList": jobTemplateFieldList, "childJobTemplateList": Session.query(JobTemplate).filter( JobTemplate.delFlag == False, JobTemplate.parentId == jobTemplate.id).all() }
def isHasChildTemplate(jobTemplateId): if (type(jobTemplateId) == list): resultList = Session.query(JobTemplate).filter(JobTemplate.parentId==jobTemplateId.in_(jobTemplateId),JobTemplate.delFlag == False).all() else: resultList = Session.query(JobTemplate).filter(JobTemplate.parentId == jobTemplateId, JobTemplate.delFlag == False).all() return True if len(resultList) > 0 else False
def getTaskList(page=Page(1, 10), userId=None, paramMap=None, orderBy=text('task_job.create_time desc'), parentId=None): # sql = None # if(userId==None): # sql=text(" del_flag=0") # else: # sql=text("create_user_id='" + userId + "' and del_flag=0") queryParam = [] if parentId == None: queryParam.append( text( "(ifnull(task_job.parent_id,'')='' or task_job.parent_id='-1')" )) else: queryParam.append(TaskJob.parentId == parentId) if userId != None and userId != "-1" and userId != "": queryParam.append(TaskJob.createUserId == userId) queryParam.append(TaskJob.delFlag == False) queryParam = BaseDao.initialQueryParam(paramMap or {}, TaskJob, queryParam) taskList = apply(Session.query(TaskJob).filter, tuple(queryParam)) if orderBy == None: orderBy = text('task_job.create_time desc') if parentId == None: taskList = taskList.order_by(orderBy).offset(page.offset).limit( page.limit) taskList = taskList.all() count = apply(Session.query(TaskJob).filter, tuple(queryParam)).count() return {"taskList": taskList, "count": count}
def queryTaskJobResult(taskJob,page=1,pageCount=10,jobTemplateId=None,parentId=None,taskHistoryId=None): from utils.DBClient import DbClient import json jobTemplate=None result=None count=None tableTitle=None db=DbClient() page=page or 1 pageCount=pageCount or 10 databaseId=None taskJobId=None if taskJob!=None: slavedb=db.getConnection(taskJob.databaseId) databaseId=taskJob.databaseId taskJobId = taskJob.id if jobTemplateId!=None: jobTemplate = Session.query(JobTemplate).filter(JobTemplate.id == jobTemplateId).first() slavedb = db.getConnection(jobTemplate.databaseId) databaseId=jobTemplate,databaseId if (taskJob!=None and taskJob.type==TaskJobHistoryType.BATCH) or jobTemplate!=None: taskJob=jobTemplate # result = slavedb.query_table_pages(jobTemplate.tableName, page, pageCount,parentId) # count = slavedb.query_count(jobTemplate.tableName,parentId) # tableTitle = slavedb.queryTableTitle(taskJob.tableName) # return # else: if taskHistoryId==None: taskJobHistory=Session.query(TaskJobHistory).filter(TaskJobHistory.taskJobId==taskJobId,TaskJobHistory.delFlag==False).\ order_by(text(" create_time desc ")).first() taskHistoryId=taskJobHistory and taskJobHistory.id result = slavedb.query_table_pages(taskJob.tableName, page, pageCount,parentId,taskHistoryId or "none") count=slavedb.query_count(taskJob.tableName,parentId,taskHistoryId or 'none') tableTitle=slavedb.queryTableTitle(taskJob.tableName,slavedb and slavedb.db_config and slavedb.db_config.get("db")) return {'result':json.loads(result),'title':json.loads(tableTitle),'count':json.loads(count)}
def getTaskList(page, userId=None, paramMap=None, orderBy=text('search_task_job.create_time desc'), parentId=None, status=None): queryParam = [] # if parentId == None: # queryParam.append(text("(ifnull(task_job.parent_id,'')='' or task_job.parent_id='-1')")) # else: # queryParam.append(SearchTaskJob.childIds == parentId) if userId != None and userId != "-1" and userId != "": queryParam.append(SearchTaskJob.createUserId == userId) queryParam.append(SearchTaskJob.delFlag == False) queryParam = BaseDao.initialQueryParam(paramMap or {}, SearchTaskJob, queryParam) taskList = apply(Session.query(SearchTaskJob).filter, tuple(queryParam)) if orderBy == None: orderBy = text('p_search_task_job.create_time desc') if status != None: taskList = taskList.filter(SearchTaskJob.status == status) if parentId == None: taskList = taskList.order_by(orderBy).offset(page.offset).limit( page.limit) taskList = taskList.all() count = apply(Session.query(SearchTaskJob).filter, tuple(queryParam)).count() return {"taskList": taskList, "count": count}
def relationTaskJob(jobTemplateId,taskJobId,fieldIds): TaskJobDao.delTaskJobReRelationByTaskJobId(taskJobId) if fieldIds!=None and str(fieldIds)=="-1": jobTemplateFieldList=TemplateDao.queryJobTemplateFieldByJobTemplateId(jobTemplateId) for field in jobTemplateFieldList: taskJobReField = TaskJobReField(id=uuid.uuid1()) taskJobReField.jobTemplateId = jobTemplateId taskJobReField.taskJobId = taskJobId taskJobReField.delFlag = False taskJobReField.jobTemplateFieldId = field.id taskJobReField.createTime=datetime.now() Session.add(taskJobReField) return fieldList=fieldIds.split(",") for fieldId in fieldList: taskJobReField=TaskJobReField(id=uuid.uuid1()) taskJobReField.jobTemplateId=jobTemplateId taskJobReField.taskJobId=taskJobId taskJobReField.delFlag=False taskJobReField.jobTemplateFieldId=fieldId taskJobReField.createTime=datetime.now() Session.add(taskJobReField) # TaskJobDao.updateTaskJob(TaskJob.id==taskJobId,{TaskJob.jobTemplateId:jobTemplateId}) Session.query(TaskJob).filter(TaskJob.id==taskJobId).update({TaskJob.jobTemplateId:jobTemplateId}) Session.flush() Session.commit()
def addTaskJobHistroy(taskJobId, type): taskJobHistory = TaskJobHistory(id=uuid.uuid1(), taskJobId=taskJobId, createTime=datetime.now(), type=type, status=0) Session.add(taskJobHistory) return taskJobHistory
def delTaskJob(taskId): taskIds = taskId.split(",") for i in range(len(taskIds)): LoggerDao.addTaskJobLogger(TaskJob(id=taskIds[i]), LoggerDao.LoggerType.DEL, content=u"删除任务") Session.query(TaskJob).filter(TaskJob.id.in_(tuple(taskIds))).update( {TaskJob.delFlag: True}, synchronize_session='fetch')
def contentDetail(id): proxy = Session.query(ProxyInfo).filter( ProxyInfo.id == id, ProxyInfo.delFlag == False).first() staticProxyList = None if proxy != None and proxy.type == ProxyType.STATIC: staticProxyList = Session.query(StaticProxyInfo).filter( StaticProxyInfo.proxyInfoId == id, ProxyInfo.delFlag == False).all() return {"result": {"proxy": proxy, "staticProxyList": staticProxyList}}
def proxyAdd(proxy): type = proxy.type if type == ProxyType.STATIC: ipList = RequestUtils.parseProxyFile() result = addStaticProxyList(ipList) else: result = proxyAddProxy(proxy) Session.add(proxy) Session.commit() return {"result": result}
def isBindTaskJob(jobTemplateId): resultList=[] temp = queryParentTemplateByChildTemplateId(jobTemplateId) if(type(jobTemplateId)==list): resultList = Session.query(TaskJobReTemplate).filter(TaskJobReTemplate.delFlag == False, TaskJobReTemplate.jobTemplateId.in_(jobTemplateId)).all() else: if temp : jobTemplateId = temp.id resultList=Session.query(TaskJobReTemplate).filter(TaskJobReTemplate.delFlag==False,TaskJobReTemplate.jobTemplateId==jobTemplateId).all() return True if len(resultList)>0 else False
def queryParentTemplateByChildTemplateId(templateId): temp = Session.query(JobTemplate).filter(JobTemplate.id == templateId, JobTemplate.delFlag == False).first() if temp.parentId != '-1': parentTemplate = Session.query(JobTemplate).filter(JobTemplate.id == temp.parentId, JobTemplate.delFlag == False).first() if parentTemplate.parentId != '-1': return queryParentTemplateByChildTemplateId(parentTemplate.parentId) else: return parentTemplate else: return temp
def queryAllTempByTaskField(taskJobId): taskJobReTemplate = Session.query(TaskJobReTemplate).filter(TaskJobReTemplate.taskJobId == taskJobId).first() result = None if taskJobReTemplate is not None: jobTemplate = Session.query(JobTemplate).filter(JobTemplate.id == taskJobReTemplate.jobTemplateId).first() jobTemplate1 = Session.query(JobTemplate).filter(JobTemplate.parentId == jobTemplate.id).first() sql = "select w1.wechatImage,w1.detailURL,w3.title,w3.time,w3.organization from %s w1 join %s w3 on w1.id=w3.parent_id" % (jobTemplate.tableName,jobTemplate1.tableName) result = BaseDao.execute(sql) if result and len(result.cursor._rows) > 0: result= result.cursor._rows return {'result':result,'count':[[len(result)]]}
def quertJudicature(id): juidicature = Session.query(JudicatureTable).filter( JudicatureTable.id == id, JudicatureTable.delFlag == 0).first() randomList = randomlist(7) juidicature.saving_cnt += randomList[0] juidicature.saved_cnt += randomList[1] juidicature.changed_cnt += randomList[2] juidicature.compromise_cnt += randomList[3] juidicature.pursue_cnt += randomList[4] juidicature.adjudged_cnt += randomList[5] Session.flush() Session.commit() return juidicature
def totalCollectByTemplateType(): result=[] contentTypeList=Session.query(ContentType).all() totleData = 0 for contentType in contentTypeList: count = 0 TemplateList=Session.query(JobTemplate).filter(JobTemplate.jobTemplateType == contentType.id).all() for jobTemplate in TemplateList: count += queryTableCount(jobTemplate) totleData += count result.append({'name':contentType.name,'count':count}) for i in range(0, len(result)): result[i]["percent"] = round(result[i]["count"] / totleData, 2) return result
def anayse(type, taskJobSequenceId): fieldWeightList = Session.query(FieldWeight).filter( FieldWeight.type == type).all() whileData = [] fieldNameList = [] for fieldWeight in fieldWeightList: weight = int(fieldWeight.weight) fieldNameEn = fieldWeight.fieldNameEn fieldNameList.append(fieldNameEn) whileData.append((fieldNameEn, weight or 0)) filterSql = text( "(SELECT parent_id FROM job_template where del_flag=0 GROUP BY parent_id)" ) # whiteJobTemplateList=Session.query(JobTemplate).filter(JobTemplate.id.notin_(filterSql),JobTemplate.delFlag==False,JobTemplate.objectType==1).all() blackJobTemplateList = Session.query(JobTemplate).filter( JobTemplate.jobTemplateType == type, JobTemplate.id.notin_(filterSql), JobTemplate.delFlag == False, JobTemplate.objectType != 1).all() # anaValue=[] jobTemplateInfo = [] for jobTemplate in blackJobTemplateList: # anaMap={"jobTemplate":jobTemplate} tableName = jobTemplate.tableName jobTemplateId = jobTemplate.id # objectType=jobTemplate.objectType jobTemplateFieldList = Session.query(JobTemplateField).filter( JobTemplateField.jobTemplateId == jobTemplateId, JobTemplateField.delFlag == False).all() fieldList = [] for jobTemplateField in jobTemplateFieldList: fieldNameEn = jobTemplateField.fieldNameEn fieldList.append(fieldNameEn) dataSql = text("select %s from %s where task_job_id_sequence='%s'" % (",".join(fieldList), tableName, taskJobSequenceId)) rs = Session.execute(dataSql).fetchall() blackValue = [] for row in rs: map = {} print row for name in fieldNameList: value = row[name] if row.has_key(name) else "" map[name] = value blackValue.append(map) jobTemplateInfo.append({ "value": blackValue, "fieldName": fieldNameList, "jobTemplateInfo": jobTemplate }) return {"value": jobTemplateInfo}
def totalCollectByTemplate(): TemplateList= Session.query(JobTemplate).all() # [{'name':'购物','pers':'','num':''}] result=[] totleData=0 for jobTemplate in TemplateList: if jobTemplate.parentId == '-1': child_temp_list=[] count=0 get_child_templates(jobTemplate.id,child_temp_list) for childTemp in child_temp_list: count=count+queryTableCount(childTemp) count=count+queryTableCount(jobTemplate) totleData+=count result.append({'name':jobTemplate.name,'count':count}) for i in range(0, len(result)): p=result[i]["count"]/float(totleData) result[i]["percent"] =float('%.2f'% p ) result.sort(lambda x, y: cmp(x['count'], y['count'])) result = sorted(result, key=lambda x: x['count']) if len(result)>5: return [result[-1],result[-2],result[-3],result[-4],result[-5],result[-6]] else: return result
def loadTemplate(page=Page(1,10),paramMap={},userId=None,orderBy=text(" job_template.create_time desc "),parentId=None): queryParam = [] # if parentId == None: # queryParam.append(text("(ifnull(job_template.parent_id,'')='' or job_template.parent_id='-1')")) # else: # queryParam.append(JobTemplate.parentId == parentId) if parentId is not None: queryParam.append(JobTemplate.parentId == parentId) if userId != None and userId != "-1" and userId != "": queryParam.append(TaskJob.createUserId == userId) if paramMap and paramMap.has_key("pageNo") and paramMap.has_key("pageSize"): page = Page(paramMap["pageNo"],paramMap["pageSize"]) queryParam.append(JobTemplate.delFlag == False) queryParam = BaseDao.initialQueryParam(paramMap or {}, JobTemplate, queryParam) filterQuery=Session.query(JobTemplate).filter taskList = apply(filterQuery, tuple(queryParam)) if orderBy == None: orderBy = text('task_job.create_time desc') if (parentId == None or parentId == '-1') and page!=None: taskList = taskList.order_by(orderBy).offset(page.offset).limit(page.limit) taskList = taskList.all() count = apply(filterQuery, tuple(queryParam)).count() # if paramMap.has_key('action'): # action = paramMap['action'] # if action == '1': # jobtemp = taskList[0] # jobtemplist = queryAllTempByTaskJob(jobtemp)#递归查询子模板 # if jobtemplist: # taskList.extend(jobtemplist) # count = len(taskList) return {"jobTemplateList": taskList, "count": count}
def get_child_templates(parentId,resultList): childTemplateList = Session.query(JobTemplate).filter(JobTemplate.parentId == parentId,JobTemplate.delFlag==False).all() if childTemplateList is not None and len(childTemplateList)>0: resultList.extend(childTemplateList) for jobTemplate in childTemplateList: get_child_templates(jobTemplate.id,resultList) pass
def runListener(): TemplateList = Session.query(JobTemplate).all() count=0 totle=0 for t in TemplateList: count+=queryTableCount(t) pass print count rs=Session.query(CrawlerDataListener).order_by(CrawlerDataListener.createTime.desc()).limit(1).all() if len(rs)>0: totle=rs[0].totle p=count-int(totle) addCrawlerDataListener(p, count) else: addCrawlerDataListener(0,count) logging.info("runListener BaseDao.add")
def updateTaskJob(taskJob): taskJobDb = Session.query(TaskJob).filter(TaskJob.id == taskJob.id).first() LoggerDao.addTaskJobLogger(taskJobDb, LoggerDao.LoggerType.MOD, content=u"更新任务", status=taskJobDb.status) return BaseDao.updateByPrimary(taskJob, TaskJob)
def getRunningTaskJobList(): sql = text( """SELECT * FROM task_job,( SELECT id AS task_id_sequence,task_job_id,create_time history_create_time FROM (select * from task_job_history order by create_time desc) A WHERE del_flag = 0 GROUP BY task_job_id ORDER BY create_time DESC ) A WHERE A.task_job_id = task_job.id AND task_job. STATUS = 1 AND task_job.del_flag = 0""" ) return Session.execute(sql)
def getConnection(self, base_id): # db_config = { # 'host': ConfigUtils.getMysqlPorperties(KEYMAP.MYSQL_HOST), # 'user': ConfigUtils.getMysqlPorperties(KEYMAP.MYSQL_USERNAME), # 'passwd': ConfigUtils.getMysqlPorperties(KEYMAP.MYSQL_PASSWD), # 'db': ConfigUtils.getMysqlPorperties(KEYMAP.MYSQL_DBNAME), # 'charset': ConfigUtils.getMysqlPorperties(KEYMAP.MYSQL_ENCODE), # 'port': ConfigUtils.getMysqlPorperties(KEYMAP.MYSQL_PORT) # } dbSource = Session.query(DbSource).filter( DbSource.id == base_id).first() if dbSource == None: host = dao.db_config.get("host") dbName = dao.db_config.get("db") userName = dao.db_config.get("user") passwd = dao.db_config.get("passwd") port = dao.db_config.get("port") dbSource = DbSource(dbname=dbName, host=host, port=port, type="mysql", password=passwd, username=userName) if dbSource == None: raise Exception("base_id is not Exist") return None self.cur_database_param = dbSource.__dict__ self.db_type = self.cur_database_param["type"] db = eval(self.db_type) self.conn = db(self.cur_database_param) return self.conn
def contentDetail(taskJobTimerId): taskJobTimerId = str(taskJobTimerId) return { "result": Session.query(TaskJobTimer).filter( TaskJobTimer.id == taskJobTimerId, TaskJobTimer.delFlag == False).first() }
def countObj(userId, clazz, paramMap): queryParam = [] queryParam.append(text("(ifnull(parent_id,'')='' or parent_id='-1')")) if userId != None and userId != "-1" and userId != "": queryParam.append(TaskJob.createUserId == userId) queryParam.append(clazz.delFlag == False) queryParam = BaseDao.initialQueryParam(paramMap, clazz, queryParam) return apply(Session.query(clazz.id).filter, tuple(queryParam)).count()
def validTaskJobName(name, id): if name == None: return True queryParam = [TaskJob.name == name, TaskJob.delFlag == False] if id != None: queryParam.append(TaskJob.id != id) count = apply(Session.query(TaskJob).filter, queryParam).count() return True if count <= 0 else False
def batch_del_templateId(templateId): #删除该模板下的所有子模板 paretTemplate = Session.query(JobTemplate).filter(JobTemplate.id == templateId).first() paretTemplate.delFlag = True childlist=[] get_child_templates(templateId,childlist) for childTemp in childlist: childTemp.delFlag=True
def exportResult(jobTemplateId,page=1,pageCount=10): from utils.DBClient import DbClient import json jobTemplate = Session.query(JobTemplate).filter(JobTemplate.id == jobTemplateId).first() db=DbClient() slavedb=db.getConnection(jobTemplate.databaseId) result=json.loads(slavedb.query_table_pages(jobTemplate.tableName,page,pageCount)) title=json.loads(slavedb.queryTableTitle(jobTemplate.tableName,db.dbparms["dbname"])) return export(jobTemplate.tableName,result,title)