예제 #1
0
 def insert(self, jobid, tablename, column_dict, paramMap=None):
     """Insert one record into the collection that belongs to a task job.

     :param jobid: task job id; only used to look up the collection name
         when ``tablename`` is None.
     :param tablename: target collection name, or None to load it from the
         task job.
     :param column_dict: mapping of column name to string value. Every key
         and every escaped value is stored padded with one space on each
         side.
     :param paramMap: optional mapping; when it holds a non-None
         ``task_job_id_sequence`` that value is stored as a string.
     :return: None
     """
     if tablename is None:
         tablename = TaskJobDao.loadTaskById(jobid).tableName

     # Iterate the mapping itself instead of indexing ``keys()``: indexing
     # only works where ``keys()`` returns a list.
     fielddic = {}
     for column, value in column_dict.items():
         fielddic[" " + column + " "] = (
             " " + MySQLdb.escape_string(value) + " ")

     # Bookkeeping columns added to every stored record.
     fielddic["task_job_del_flag"] = "False"
     fielddic["task_job_create_time"] = time.strftime('%Y-%m-%d %H:%M:%S')

     sequence = None
     if paramMap is not None:
         sequence = paramMap.get("task_job_id_sequence")
     if sequence is not None:
         fielddic["task_job_id_sequence"] = str(sequence)

     self.db[tablename].insert(fielddic)
예제 #2
0
파일: HdfsUtil.py 프로젝트: ldw0810/Crawler
 def createTableByTaskJobId(self,
                            taskJobId,
                            tableName=None,
                            jobTemplateFieldList=None,
                            data=None):
     """Create the HDFS file that backs a task job's table.

     :param taskJobId: task job id; only used to look up the table name
         when ``tableName`` is None.
     :param tableName: file name under ``self.path``, or None to load it
         from the task job.
     :param jobTemplateFieldList: not used by this implementation.
     :param data: initial content handed to ``self.hdfs.create``.
     :return: None
     """
     if tableName is None:
         tableName = TaskJobDao.loadTaskById(taskJobId).tableName
     target = self.path + '/' + tableName
     self.hdfs.create(target, data, replication=2)
예제 #3
0
    def createTableByTaskJobId(self,
                               jobid,
                               tableName=None,
                               jobTemplateFieldList=None):
        """
        Create the collection for a task job by inserting a placeholder
        document.

        The document has one key per column definition string and one key
        per bookkeeping column; every value is an empty string except
        ``task_job_create_time``, which holds the current local time.

        :param jobid: task job id; used to look up the collection name and
            the template fields when they are not supplied.
        :param tableName: collection name, or None to load it from the task
            job.
        :param jobTemplateFieldList: field definitions exposing
            ``dataLength``, ``dataType`` and ``fieldNameEn``; queried through
            ``TemplateDao`` when None or empty.
        :return: None. Nothing is inserted when no field definitions are
            found.
        """
        if tableName is None:
            tableName = TaskJobDao.loadTaskById(jobid).tableName

        # No existence check is made here, so every call inserts a new
        # placeholder document.
        if not jobTemplateFieldList:
            jobTemplateFieldList = (
                TemplateDao.queryJobTemplateFieldByJobTemplateId(jobid))
        if not jobTemplateFieldList:
            return

        fieldList = []
        for jobTemplateField in jobTemplateFieldList:
            dataLength = jobTemplateField.dataLength
            dataType = jobTemplateField.dataType or "varchar"
            fieldNameEn = jobTemplateField.fieldNameEn
            if dataType == 'int':
                fieldList.append("`%s` %s" % (fieldNameEn, dataType))
            elif (dataLength is not None and dataLength > 0) or (
                    dataLength is None and dataType == "varchar"):
                # Sized columns always get length 1024; the declared
                # dataLength only decides whether a size is emitted at all.
                fieldList.append("`%s` %s(%s)" %
                                 (fieldNameEn, dataType, "1024"))
            else:
                fieldList.append("`%s` %s" % (fieldNameEn, dataType))

        # Bookkeeping keys; the trailing spaces are part of the stored keys.
        fieldList.extend([
            "id ",
            "task_job_create_time",
            "task_job_del_flag ",
            "task_job_id_sequence",
            "parent_id ",
            "task_job_url ",
        ])

        fielddic = dict.fromkeys(fieldList, '')
        fielddic['task_job_create_time'] = time.strftime('%Y-%m-%d %H:%M:%S')
        self.db[tableName].insert(fielddic)
예제 #4
0
    def createTableByTaskJobId(self,
                               taskJobId,
                               tableName=None,
                               jobTemplateFieldList=None):
        """
        Create the database table for a task job.

        :param taskJobId: task job id; used to look up the table name and
            the template fields when they are not supplied.
        :param tableName: table name, or None to load it from the task job.
        :param jobTemplateFieldList: field definitions exposing
            ``dataLength``, ``dataType`` and ``fieldNameEn``; queried through
            ``TemplateDao`` when None or empty.
        :return: None. Nothing is executed when the table already exists or
            no field definitions are found.
        """
        if tableName is None:
            tableName = TaskJobDao.loadTaskById(taskJobId).tableName

        if self.isTableExist(tableName):
            logging.info('isTableExist:%s' % ('TRUE'))
            return
        if not jobTemplateFieldList:
            jobTemplateFieldList = (
                TemplateDao.queryJobTemplateFieldByJobTemplateId(taskJobId))
        if not jobTemplateFieldList:
            return

        fieldList = []
        for jobTemplateField in jobTemplateFieldList:
            dataLength = jobTemplateField.dataLength
            # Apply the default BEFORE str(): str(None) is the truthy string
            # "None", which would otherwise become the column type.
            dataType = str(jobTemplateField.dataType or "varchar")
            fieldNameEn = str(jobTemplateField.fieldNameEn)
            if dataType == 'int':
                fieldList.append("%s %s" % (fieldNameEn, dataType))
            elif (dataLength is not None and dataLength > 0) or (
                    dataLength is None and dataType == "varchar"):
                # Sized columns always get length 1024; the declared
                # dataLength only decides whether a size is emitted at all.
                fieldList.append("%s %s(%s)" %
                                 (fieldNameEn, dataType, "1024"))
            else:
                fieldList.append("%s %s" % (fieldNameEn, dataType))

        # Bookkeeping columns appended to every table.
        fieldList.extend([
            "id varchar(50) primary key",
            "task_job_create_time datetime",
            "task_job_del_flag int",
            "task_job_id_sequence varchar(50)",
            "parent_id varchar(50)",
            "task_job_url varchar(255)",
        ])
        create_table_sql = "create table %s(%s)" % (tableName,
                                                    ",".join(fieldList))
        self.execute(create_table_sql)
예제 #5
0
    def createTableByTaskJobId(self, jobid, tableName=None, jobTemplateFieldList=None):
        """
        Create the database table for a task job and index its lookup columns.

        The table name is backtick-quoted in both statements so that the
        CREATE TABLE and the following ALTER TABLE refer to the identifier
        in the same way.

        :param jobid: task job id; used to look up the table name and the
            template fields when they are not supplied.
        :param tableName: table name, or None to load it from the task job.
        :param jobTemplateFieldList: field definitions exposing
            ``dataLength``, ``dataType`` and ``fieldNameEn``; queried through
            ``TemplateDao`` when None or empty.
        :return: None. Nothing is executed when no field definitions are
            found.
        """
        if tableName is None:
            tableName = TaskJobDao.loadTaskById(jobid).tableName

        # No existence check is made here; the CREATE TABLE statement is
        # always issued.
        if not jobTemplateFieldList:
            jobTemplateFieldList = TemplateDao.queryJobTemplateFieldByJobTemplateId(jobid)
        if not jobTemplateFieldList:
            return

        fieldList = []
        for jobTemplateField in jobTemplateFieldList:
            dataLength = jobTemplateField.dataLength
            dataType = jobTemplateField.dataType or "varchar"
            fieldNameEn = jobTemplateField.fieldNameEn
            if dataType == 'int':
                fieldList.append("`%s` %s" % (fieldNameEn, dataType))
            elif (dataLength is not None and dataLength > 0) or (
                    dataLength is None and dataType == "varchar"):
                # Sized columns always get length 1024; the declared
                # dataLength only decides whether a size is emitted at all.
                fieldList.append("`%s` %s(%s)" % (fieldNameEn, dataType, "1024"))
            else:
                fieldList.append("`%s` %s" % (fieldNameEn, dataType))

        # Bookkeeping columns appended to every table.
        fieldList.extend([
            "id varchar(50) primary key",
            "task_job_create_time datetime",
            "task_job_del_flag tinyint",
            "task_job_id_sequence varchar(50)",
            "parent_id varchar(50)",
            "task_job_url varchar(1024)",
        ])
        create_table_sql = "create table `%s`(%s)" % (tableName, ",".join(fieldList))
        self.cursor.execute(create_table_sql)
        # Index the lookup columns to speed up queries.
        self.cursor.execute("alter table `%s` add index index_name(`parent_id`,`task_job_id_sequence`)" % (tableName))
예제 #6
0
파일: HdfsUtil.py 프로젝트: ldw0810/Crawler
 def insert(self, jobid, tablename, column_dict, paramMap=None):
     """Append one record to a task job's HDFS table, creating it if needed.

     ``column_dict`` is updated in place with the bookkeeping columns
     before it is written.

     :param jobid: task job id; used to look up the table name when
         ``tablename`` is None and forwarded when the table is created.
     :param tablename: file name under ``self.path``, or None to load it
         from the task job.
     :param column_dict: mapping of column name to value for the record.
     :param paramMap: optional mapping; when it holds a non-None
         ``task_job_id_sequence`` that value is stored as a string.
     :return: None
     """
     if tablename is None:
         tablename = TaskJobDao.loadTaskById(jobid).tableName
     path = self.path + '/' + tablename

     sequence = None
     if paramMap is not None:
         sequence = paramMap.get("task_job_id_sequence")
     if sequence is not None:
         column_dict["task_job_id_sequence"] = str(sequence)
     column_dict["task_job_del_flag"] = "False"
     column_dict["task_job_create_time"] = time.strftime('%Y-%m-%d %H:%M:%S')

     if self.isTableExist(tablename):
         self.append(path, column_dict)
     else:
         # NOTE(review): the record is passed as the third positional
         # argument. If the callee's third parameter is a field list and
         # not the file content, this first record is not written --
         # confirm against createTableByTaskJobId's signature.
         self.createTableByTaskJobId(jobid, tablename, column_dict)
0
 def beforeStartUrl(self, dataDict):
     """Load spider state from a queued payload and return its start URL.

     Sets ``self.url``, ``self.cur_url_depth`` and ``self.depth_limit``;
     sets ``self.taskJob`` and ``self.taskJobHistory`` when the payload
     names a task job history that can be loaded; fills
     ``self.allowed_domain`` from the URL when it is still None.

     :param dataDict: payload mapping, or None. Keys read: ``id``,
         ``taskJobHistoryId``, ``url``, ``curUrlDepth``, ``depthLimit``.
     :return: the start URL, or None when ``dataDict`` is None or carries
         no ``id``.
     """
     if dataDict is None:
         return None
     dataId = dataDict.get("id")
     if dataId is None:
         return None

     # NOTE(review): the fetched status is never used here; the call is
     # kept so behaviour is unchanged -- confirm whether it can be dropped.
     RedisUtils.hgetUrlRedisStatus(RedisUtils.prefix + dataId)

     taskJobHistoryId = dataDict.get("taskJobHistoryId")
     if taskJobHistoryId:
         taskJobHistory = TaskJobDao.loadTaskJobHistoryById(
             taskJobHistoryId)
         if taskJobHistory:
             self.taskJob = TaskJobDao.loadTaskById(
                 taskJobHistory.taskJobId)
             self.taskJobHistory = taskJobHistory

     # dict.get with a default replaces has_key, which Python 3 removed.
     url = dataDict.get("url", "http://www.baidu.com")
     self.url = url
     if self.allowed_domain is None:
         self.allowed_domain = self.get_first_domain(self.get_domain(url))
     self.cur_url_depth = dataDict.get("curUrlDepth")
     self.depth_limit = dataDict.get("depthLimit", 3)
     return url
예제 #8
0
    def next_requests(self):
        use_set = self.settings.getbool('REDIS_START_URLS_AS_SET',
                                        defaults.START_URLS_AS_SET)
        fetch_one = self.server.spop if use_set else self.server.lpop

        if not LicenseUtils.check_licence(LicenseUtils.get_mac_address()):
            reactor.stop()
        """Returns a request to be scheduled or none."""

        # XXX: Do we need to use a timeout here?
        found = 0
        while found < self.redis_batch_size:
            redis_key = fetch_one(self.redis_key)
            taskJobHistoryId = redis_key
            if taskJobHistoryId != None:
                taskJobHistory = loadTaskJobHistoryById(taskJobHistoryId)
                if taskJobHistory:
                    taskJobId = taskJobHistory.taskJobId
                    taskJob = TaskJobDao.loadTaskById(taskJobId)
                    if taskJob and taskJob.status == TaskStatus.PAUSE:
                        RedisUtils.lpush(
                            ConfigUtils.getRedisPorperties(
                                KEYMAP.ASSIST_SPIDER_REDIS_KEY),
                            taskJobHistoryId)
                        break
                else:
                    break
            else:
                break
            if hashswitch:
                if str(localIP) != str(tjs.get_node(redis_key)):
                    RedisUtils.lpush(self.redis_key, redis_key)
                    return

            redis_key = self.redis_key + "_" + redis_key
            orginData = fetch_one(redis_key)
            data = None
            # data = fetch_one(self.redis_key)
            try:
                logging.info("orginData==" + orginData)
                orginData = json.loads(orginData)
                orginData["taskJobHistoryId"] = taskJobHistoryId
                data = self.beforeStartUrl(orginData)
            except Exception, e:
                logging.error("Error e:")
                logging.error(e)
                logging.error(orginData)
                break
            if not data:
                # Queue empty.
                logging.warning('********dataUrl is null*************')
                break
            try:
                req = self.make_request_from_data(data)
                # req.replace(meta={"id":"123"})
                req.meta["id"] = orginData.get("id")
                req.meta["dataParentId"] = orginData.get("dataParentId")
                req.meta["taskJobHistoryId"] = orginData.get(
                    "taskJobHistoryId")
                req.meta["url"] = orginData.get("url")
                urlListStatusId = req.meta["urlListStatusId"] = orginData.get(
                    "urlListStatusId")
            except Exception, e:
                logging.error("make_request_from_data:e:" + e)
                break