def insert(self, jobid, tablename, column_dict, paramMap=None):
    """Insert one record into the collection that backs a task job.

    :param jobid: task job id; only used to look up the collection name
        when ``tablename`` is None.
    :param tablename: target collection name, or None to use the
        ``tableName`` of the task job loaded through ``TaskJobDao``.
    :param column_dict: mapping of field name to field value. Every name is
        stored padded with one space on each side, and every value is passed
        through ``MySQLdb.escape_string`` and padded the same way.
    :param paramMap: optional mapping; when it holds a non-None
        ``task_job_id_sequence`` entry, that value is stored as a string
        under the ``task_job_id_sequence`` key.
    :return: None
    """
    if tablename is None:
        tablename = TaskJobDao.loadTaskById(jobid).tableName

    # Walk the mapping once. The previous code rebuilt column_dict.keys()
    # on every iteration and indexed into it, which is quadratic on
    # Python 2 and a TypeError on Python 3 (dict views are not indexable).
    keys = []
    valueslist = []
    for name, value in column_dict.items():
        keys.append(" " + name + " ")
        valueslist.append(" " + MySQLdb.escape_string(value) + " ")

    # Bookkeeping fields written with every record.
    createTime = time.strftime('%Y-%m-%d %H:%M:%S')
    keys.extend(["task_job_del_flag", "task_job_create_time"])
    valueslist.extend(["False", createTime])

    task_job_id_sequenceValue = None
    if paramMap is not None:
        task_job_id_sequenceValue = paramMap.get("task_job_id_sequence")
    if task_job_id_sequenceValue is not None:
        keys.append("task_job_id_sequence")
        valueslist.append(str(task_job_id_sequenceValue))

    self.db[tablename].insert(dict(zip(keys, valueslist)))
def createTableByTaskJobId(self, taskJobId, tableName=None, jobTemplateFieldList=None, data=None):
    """Create the HDFS file that acts as the table for a task job.

    :param taskJobId: task job id; only used to look up the table name when
        ``tableName`` is None.
    :param tableName: file name under ``self.path``, or None to use the
        ``tableName`` of the task job loaded through ``TaskJobDao``.
    :param jobTemplateFieldList: accepted but not used by this method.
    :param data: payload handed to ``self.hdfs.create`` as the file content.
    :return: None
    """
    target = tableName
    if target is None:
        target = TaskJobDao.loadTaskById(taskJobId).tableName
    filePath = "/".join((self.path, target))
    self.hdfs.create(filePath, data, replication=2)
def createTableByTaskJobId(self, jobid, tableName=None, jobTemplateFieldList=None):
    """Create the collection for a task job by inserting a placeholder document.

    The placeholder's keys are column-definition strings built from the
    template fields, plus a fixed set of bookkeeping keys. Every value is an
    empty string except ``task_job_create_time``, which holds the current
    local time.

    :param jobid: task job id; used to look up the collection name when
        ``tableName`` is None and the template fields when none are given.
    :param tableName: collection name, or None to use the task job's
        ``tableName``.
    :param jobTemplateFieldList: template field objects exposing
        ``dataLength``, ``dataType`` and ``fieldNameEn``; when None or empty
        they are queried through ``TemplateDao``.
    :return: None. Nothing is inserted when no template fields are found.
    """
    if tableName is None:
        tableName = TaskJobDao.loadTaskById(jobid).tableName

    templateFields = jobTemplateFieldList
    if templateFields is None or len(templateFields) == 0:
        templateFields = TemplateDao.queryJobTemplateFieldByJobTemplateId(jobid)
        if templateFields is None or len(templateFields) == 0:
            return

    documentKeys = []
    for field in templateFields:
        length = field.dataLength
        typeName = field.dataType or "varchar"
        name = field.fieldNameEn
        # Non-int types with a positive length, and varchar with no length,
        # are written with a fixed length of 1024.
        sized = typeName != 'int' and (
            (length is not None and length > 0)
            or (length is None and typeName == "varchar"))
        if sized:
            documentKeys.append("`%s` %s(%s)" % (name, typeName, "1024"))
        else:
            documentKeys.append("`%s` %s" % (name, typeName))

    # Bookkeeping keys; the trailing spaces are part of the stored key names.
    documentKeys.extend([
        "id ",
        "task_job_create_time",
        "task_job_del_flag ",
        "task_job_id_sequence",
        "parent_id ",
        "task_job_url ",
    ])

    placeholder = dict.fromkeys(documentKeys, '')
    placeholder['task_job_create_time'] = time.strftime('%Y-%m-%d %H:%M:%S')
    self.db[tableName].insert(placeholder)
def createTableByTaskJobId(self, taskJobId, tableName=None, jobTemplateFieldList=None):
    """Create the database table that stores the results of a task job.

    Does nothing when the table already exists or when no template fields
    can be found for the job.

    :param taskJobId: task job id; used to look up the table name when
        ``tableName`` is None and the template fields when none are given.
    :param tableName: table name, or None to use the task job's
        ``tableName``.
    :param jobTemplateFieldList: template field objects exposing
        ``dataLength``, ``dataType`` and ``fieldNameEn``; when None or empty
        they are queried through ``TemplateDao``.
    :return: None
    """
    if tableName is None:
        tableName = TaskJobDao.loadTaskById(taskJobId).tableName
    if self.isTableExist(tableName):
        logging.info('isTableExist:%s' % ('TRUE'))
        return
    if jobTemplateFieldList is None or len(jobTemplateFieldList) == 0:
        jobTemplateFieldList = TemplateDao.queryJobTemplateFieldByJobTemplateId(
            taskJobId)
    if jobTemplateFieldList is None or len(jobTemplateFieldList) == 0:
        return

    fieldList = []
    for jobTemplateField in jobTemplateFieldList:
        dataLength = jobTemplateField.dataLength
        # Apply the "varchar" fallback BEFORE stringifying: str(None) is the
        # truthy string "None", so `str(x) or "varchar"` never fell back and
        # produced columns of type "None".
        rawType = jobTemplateField.dataType
        dataType = str(rawType) if rawType else "varchar"
        fieldNameEn = str(jobTemplateField.fieldNameEn)
        if dataType == 'int':
            fieldList.append("%s %s" % (fieldNameEn, dataType))
        elif (dataLength is not None and dataLength > 0) or (
                dataLength is None and dataType == "varchar"):
            # The declared length is not used; sized columns always get 1024.
            fieldList.append("%s %s(%s)" % (fieldNameEn, dataType, "1024"))
        else:
            fieldList.append("%s %s" % (fieldNameEn, dataType))

    # Bookkeeping columns present on every task table.
    fieldList.extend([
        "id varchar(50) primary key",
        "task_job_create_time datetime",
        "task_job_del_flag int",
        "task_job_id_sequence varchar(50)",
        "parent_id varchar(50)",
        "task_job_url varchar(255)",
    ])
    # NOTE: table and column names are interpolated straight into the DDL
    # statement, so they must come from trusted configuration.
    create_table_sql = "create table %s(%s)" % (tableName, ",".join(fieldList))
    self.execute(create_table_sql)
def createTableByTaskJobId(self,jobid,tableName=None,jobTemplateFieldList=None):
    """Create the MySQL table for a task job and index its lookup columns.

    :param jobid: task job id; used to look up the table name when
        ``tableName`` is None and the template fields when none are given.
    :param tableName: table name, or None to use the task job's
        ``tableName``.
    :param jobTemplateFieldList: template field objects exposing
        ``dataLength``, ``dataType`` and ``fieldNameEn``; when None or empty
        they are queried through ``TemplateDao``.
    :return: None. No statement is executed when no template fields are
        found.
    """
    if tableName is None:
        tableName = TaskJobDao.loadTaskById(jobid).tableName

    templateFields = jobTemplateFieldList
    if templateFields is None or len(templateFields) == 0:
        templateFields = TemplateDao.queryJobTemplateFieldByJobTemplateId(jobid)
        if templateFields is None or len(templateFields) == 0:
            return

    columns = []
    for field in templateFields:
        length = field.dataLength
        sqlType = field.dataType or "varchar"
        name = field.fieldNameEn
        # Non-int types with a positive length, and varchar with no length,
        # are declared with a fixed length of 1024.
        sized = sqlType != 'int' and (
            (length is not None and length > 0)
            or (length is None and sqlType == "varchar"))
        if sized:
            columns.append("`%s` %s(%s)" % (name, sqlType, "1024"))
        else:
            columns.append("`%s` %s" % (name, sqlType))

    # Bookkeeping columns present on every task table.
    columns.extend([
        "id varchar(50) primary key",
        "task_job_create_time datetime",
        "task_job_del_flag tinyint",
        "task_job_id_sequence varchar(50)",
        "parent_id varchar(50)",
        "task_job_url varchar(1024)",
    ])

    cursor = self.cursor
    cursor.execute("create table %s(%s)" % (tableName, ",".join(columns)))
    # Add an index on the columns used for lookups to speed up queries.
    cursor.execute("alter table `%s` add index index_name(`parent_id`,`task_job_id_sequence`)"%(tableName))
def insert(self, jobid, tablename, column_dict, paramMap=None):
    """Append one record to the HDFS file backing a task job.

    ``column_dict`` is updated in place with the bookkeeping fields before
    it is written.

    :param jobid: task job id; used to look up the table name when
        ``tablename`` is None, and forwarded to ``createTableByTaskJobId``
        when the table does not exist yet.
    :param tablename: file name under ``self.path``, or None to use the
        ``tableName`` of the task job loaded through ``TaskJobDao``.
    :param column_dict: mapping of field name to value; mutated in place.
    :param paramMap: optional mapping; a non-None ``task_job_id_sequence``
        entry is copied into the record as a string.
    :return: None
    """
    if tablename is None:
        tablename = TaskJobDao.loadTaskById(jobid).tableName
    filePath = "/".join((self.path, tablename))
    now = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))

    sequence = None
    if paramMap is not None:
        sequence = paramMap.get("task_job_id_sequence")
    if sequence is not None:
        column_dict.update({"task_job_id_sequence": str(sequence)})
    column_dict.update({
        "task_job_del_flag": "False",
        "task_job_create_time": now,
    })

    if not self.isTableExist(tablename):
        # NOTE(review): column_dict is passed positionally as the third
        # argument of createTableByTaskJobId; confirm that method actually
        # persists this first record rather than ignoring it.
        self.createTableByTaskJobId(jobid, tablename, column_dict)
        return
    self.append(filePath, column_dict)
def beforeStartUrl(self, dataDict):
    """Load spider state from a queued start-URL record and return its URL.

    Sets ``self.url``, ``self.cur_url_depth`` and ``self.depth_limit`` from
    the record. Sets ``self.taskJob`` and ``self.taskJobHistory`` when the
    record's ``taskJobHistoryId`` resolves to a history entry, and
    initialises ``self.allowed_domain`` from the URL when it is still None.

    :param dataDict: mapping describing the queued URL. Keys read here:
        ``id``, ``taskJobHistoryId``, ``url``, ``curUrlDepth``,
        ``depthLimit``.
    :return: the record's ``url`` (``"http://www.baidu.com"`` when the key
        is absent), or None when ``dataDict`` is None or has no ``id``.
    """
    if dataDict is None:
        return dataDict
    recordId = dataDict.get("id")
    if recordId is None:
        return

    # The returned status was never used; the call itself is kept so the
    # Redis access the original code performed still happens.
    RedisUtils.hgetUrlRedisStatus(RedisUtils.prefix + recordId)

    taskJobHistoryId = dataDict.get("taskJobHistoryId")
    if taskJobHistoryId:
        taskJobHistory = TaskJobDao.loadTaskJobHistoryById(taskJobHistoryId)
        if taskJobHistory:
            self.taskJob = TaskJobDao.loadTaskById(taskJobHistory.taskJobId)
            self.taskJobHistory = taskJobHistory

    # dict.has_key() was removed in Python 3; get() with a default returns
    # the same value and works on both Python 2 and 3.
    url = dataDict.get("url", "http://www.baidu.com")
    self.url = url
    if self.allowed_domain is None:
        self.allowed_domain = self.get_first_domain(self.get_domain(url))
    self.cur_url_depth = dataDict.get("curUrlDepth")
    self.depth_limit = dataDict.get("depthLimit", 3)
    return url
def next_requests(self): use_set = self.settings.getbool('REDIS_START_URLS_AS_SET', defaults.START_URLS_AS_SET) fetch_one = self.server.spop if use_set else self.server.lpop if not LicenseUtils.check_licence(LicenseUtils.get_mac_address()): reactor.stop() """Returns a request to be scheduled or none.""" # XXX: Do we need to use a timeout here? found = 0 while found < self.redis_batch_size: redis_key = fetch_one(self.redis_key) taskJobHistoryId = redis_key if taskJobHistoryId != None: taskJobHistory = loadTaskJobHistoryById(taskJobHistoryId) if taskJobHistory: taskJobId = taskJobHistory.taskJobId taskJob = TaskJobDao.loadTaskById(taskJobId) if taskJob and taskJob.status == TaskStatus.PAUSE: RedisUtils.lpush( ConfigUtils.getRedisPorperties( KEYMAP.ASSIST_SPIDER_REDIS_KEY), taskJobHistoryId) break else: break else: break if hashswitch: if str(localIP) != str(tjs.get_node(redis_key)): RedisUtils.lpush(self.redis_key, redis_key) return redis_key = self.redis_key + "_" + redis_key orginData = fetch_one(redis_key) data = None # data = fetch_one(self.redis_key) try: logging.info("orginData==" + orginData) orginData = json.loads(orginData) orginData["taskJobHistoryId"] = taskJobHistoryId data = self.beforeStartUrl(orginData) except Exception, e: logging.error("Error e:") logging.error(e) logging.error(orginData) break if not data: # Queue empty. logging.warning('********dataUrl is null*************') break try: req = self.make_request_from_data(data) # req.replace(meta={"id":"123"}) req.meta["id"] = orginData.get("id") req.meta["dataParentId"] = orginData.get("dataParentId") req.meta["taskJobHistoryId"] = orginData.get( "taskJobHistoryId") req.meta["url"] = orginData.get("url") urlListStatusId = req.meta["urlListStatusId"] = orginData.get( "urlListStatusId") except Exception, e: logging.error("make_request_from_data:e:" + e) break