Пример #1
0
    def execute(self, task):
        """Queue one child task per additional page of a paginated listing.

        Reads the pagination attributes from ``task.crawlerEle``, selects the
        pagination nodes inside ``task.htmlNode`` and extracts from them the
        highest page number and the URL prefix of a page link.  For every
        page number from 2 up to that maximum a new ``Task`` is created,
        flagged with ``hasPagiNate = True`` and added to the frontier.

        Nothing is queued when the element has no (or an empty)
        PAGINATE_XPATH attribute, when ``task.hasPagiNate`` is already set,
        when ``task.htmlNode`` is None, or when no positive page count and
        non-empty URL could be extracted.

        Any exception raised while processing is caught and printed; it is
        never propagated to the caller.

        :param task: the task being processed; must expose ``crawlerEle``,
            ``htmlNode``, ``frontier``, ``hasPagiNate``, ``parentId`` and
            ``parentNode``.
        """
        attrs = task.crawlerEle.attrib
        paginateXpath = attrs.get(datanode.PAGINATE_XPATH)
        # An empty XPath is treated like a missing one, the same way every
        # other pagination attribute below is handled.
        if not paginateXpath or task.hasPagiNate:
            return
        try:
            htmlNode = task.htmlNode
            if htmlNode is None:
                return

            nameSpace = task.frontier.getNameSpace()
            loopEles = parseutil.selectNodes(paginateXpath, nameSpace,
                                             htmlNode)
            paginateMaxXpath = attrs.get(datanode.PAGINATE_MAX_XPATH)
            paginateMaxRule = attrs.get(datanode.PAGINATE_MAX_RULE)
            paginateUrlXpath = attrs.get(datanode.PAGINATE_URL_XPATH)
            paginateUrlRule = attrs.get(datanode.PAGINATE_URL_RULE)

            maxPage = 0
            url = ''
            # Each matched node overwrites the previous result, so the values
            # taken from the last node win.
            # NOTE(review): child.tostring() assumes the nodes returned by
            # parseutil.selectNodes expose a tostring() method -- confirm.
            for child in loopEles:
                if paginateMaxXpath:
                    maxPage = parseutil.extractValueByXpath(
                        paginateMaxXpath, nameSpace, child)
                    if paginateMaxRule:
                        maxPage = parseutil.extractValueByRule(
                            paginateMaxRule, maxPage)
                elif paginateMaxRule:
                    # Only apply the rule when one is configured (mirrors the
                    # URL branch below); previously a missing rule was passed
                    # straight to extractValueByRule.
                    maxPage = parseutil.extractValueByRule(
                        paginateMaxRule, child.tostring())

                if paginateUrlXpath:
                    url = parseutil.extractValueByXpath(
                        paginateUrlXpath, nameSpace, child)
                    if paginateUrlRule:
                        url = parseutil.extractValueByRule(
                            paginateUrlRule, url)
                elif paginateUrlRule:
                    url = parseutil.extractValueByRule(
                        paginateUrlRule, child.tostring())

            if maxPage in (None, '') or not url:
                return
            # A non-numeric page count raises here and is reported by the
            # handler below.
            lastPage = int(maxPage)

            # NOTE(review): the host is hard-coded, so the extracted URL is
            # expected to be a host-relative prefix that ends right before
            # the page number.
            baseUrl = "http://place.qyer.com"
            # Starts at 2: presumably page 1 is the page this task already
            # covers -- confirm against the caller.
            for pageNo in range(2, lastPage + 1):
                childTask = Task(task.getFrontier())
                childTask.setCrawlerEle(task.getCrawlerEle())
                childTask.htmlNode = htmlNode
                childTask.nextCrawlerUrl = baseUrl + url + str(pageNo)
                childTask.parentId = task.parentId
                childTask.parentNode = task.parentNode
                # Marks the child so it does not paginate again.
                childTask.hasPagiNate = True
                childTask.getFrontier().addTask(childTask)
        except Exception as e:
            print("%s %s" % (
                e, "executing PaginateProcess has occurred exception"))
Пример #2
0
    def execute(self, task):
        """Create child tasks for every node matched by a LOOP element.

        When ``task.crawlerEle`` is a LOOP element with a non-empty
        LOOP_XPATH attribute and ``task.htmlNode`` is available, the XPath is
        evaluated against that node and ``self.createChildren`` is called
        once per selected node.

        Any exception raised while processing is caught and printed; it is
        never propagated to the caller.

        :param task: the task being processed; must expose ``crawlerEle``,
            ``htmlNode`` and ``frontier``.
        """
        ele = task.crawlerEle
        if ele.tag != datanode.LOOP:
            return
        try:
            loopXpath = ele.attrib.get(datanode.LOOP_XPATH)
            # Nothing to iterate without an XPath or a parsed document.
            if not loopXpath or task.htmlNode is None:
                return

            loopEles = parseutil.selectNodes(
                loopXpath, task.frontier.getNameSpace(), task.htmlNode)
            for loopEle in loopEles:
                self.createChildren(task, loopEle)
        except Exception as e:
            print("%s %s" % (
                e, "executing LoopProcess has occurred exception"))
Пример #3
0
    def execute(self, task):
        """Create child tasks for every node matched by a LOOP element.

        When ``task.crawlerEle`` is a LOOP element with a non-empty
        LOOP_XPATH attribute and ``task.htmlNode`` is available, the XPath is
        evaluated against that node and ``self.createChildren`` is called
        once per selected node.

        Any exception raised while processing is caught and printed; it is
        never propagated to the caller.

        :param task: the task being processed; must expose ``crawlerEle``,
            ``htmlNode`` and ``frontier``.
        """
        ele = task.crawlerEle
        if ele.tag != datanode.LOOP:
            return
        try:
            loopXpath = ele.attrib.get(datanode.LOOP_XPATH)
            # Nothing to iterate without an XPath or a parsed document.
            if not loopXpath or task.htmlNode is None:
                return

            loopEles = parseutil.selectNodes(
                loopXpath, task.frontier.getNameSpace(), task.htmlNode)
            for loopEle in loopEles:
                self.createChildren(task, loopEle)
        except Exception as e:
            print("%s %s" % (
                e, "executing LoopProcess has occurred exception"))
Пример #4
0
    def execute(self, task):
        """Queue one child task per additional page of a paginated listing.

        Reads the pagination attributes from ``task.crawlerEle``, selects the
        pagination nodes inside ``task.htmlNode`` and extracts from them the
        highest page number and the URL prefix of a page link.  For every
        page number from 2 up to that maximum a new ``Task`` is created,
        flagged with ``hasPagiNate = True`` and added to the frontier.

        Nothing is queued when the element has no (or an empty)
        PAGINATE_XPATH attribute, when ``task.hasPagiNate`` is already set,
        when ``task.htmlNode`` is None, or when no positive page count and
        non-empty URL could be extracted.

        Any exception raised while processing is caught and printed; it is
        never propagated to the caller.

        :param task: the task being processed; must expose ``crawlerEle``,
            ``htmlNode``, ``frontier``, ``hasPagiNate``, ``parentId`` and
            ``parentNode``.
        """
        attrs = task.crawlerEle.attrib
        paginateXpath = attrs.get(datanode.PAGINATE_XPATH)
        # An empty XPath is treated like a missing one, the same way every
        # other pagination attribute below is handled.
        if not paginateXpath or task.hasPagiNate:
            return
        try:
            htmlNode = task.htmlNode
            if htmlNode is None:
                return

            nameSpace = task.frontier.getNameSpace()
            loopEles = parseutil.selectNodes(paginateXpath, nameSpace,
                                             htmlNode)
            paginateMaxXpath = attrs.get(datanode.PAGINATE_MAX_XPATH)
            paginateMaxRule = attrs.get(datanode.PAGINATE_MAX_RULE)
            paginateUrlXpath = attrs.get(datanode.PAGINATE_URL_XPATH)
            paginateUrlRule = attrs.get(datanode.PAGINATE_URL_RULE)

            maxPage = 0
            url = ''
            # Each matched node overwrites the previous result, so the values
            # taken from the last node win.
            # NOTE(review): child.tostring() assumes the nodes returned by
            # parseutil.selectNodes expose a tostring() method -- confirm.
            for child in loopEles:
                if paginateMaxXpath:
                    maxPage = parseutil.extractValueByXpath(
                        paginateMaxXpath, nameSpace, child)
                    if paginateMaxRule:
                        maxPage = parseutil.extractValueByRule(
                            paginateMaxRule, maxPage)
                elif paginateMaxRule:
                    # Only apply the rule when one is configured (mirrors the
                    # URL branch below); previously a missing rule was passed
                    # straight to extractValueByRule.
                    maxPage = parseutil.extractValueByRule(
                        paginateMaxRule, child.tostring())

                if paginateUrlXpath:
                    url = parseutil.extractValueByXpath(
                        paginateUrlXpath, nameSpace, child)
                    if paginateUrlRule:
                        url = parseutil.extractValueByRule(
                            paginateUrlRule, url)
                elif paginateUrlRule:
                    url = parseutil.extractValueByRule(
                        paginateUrlRule, child.tostring())

            if maxPage in (None, '') or not url:
                return
            # A non-numeric page count raises here and is reported by the
            # handler below.
            lastPage = int(maxPage)

            # NOTE(review): the host is hard-coded, so the extracted URL is
            # expected to be a host-relative prefix that ends right before
            # the page number.
            baseUrl = "http://place.qyer.com"
            # Starts at 2: presumably page 1 is the page this task already
            # covers -- confirm against the caller.
            for pageNo in range(2, lastPage + 1):
                childTask = Task(task.getFrontier())
                childTask.setCrawlerEle(task.getCrawlerEle())
                childTask.htmlNode = htmlNode
                childTask.nextCrawlerUrl = baseUrl + url + str(pageNo)
                childTask.parentId = task.parentId
                childTask.parentNode = task.parentNode
                # Marks the child so it does not paginate again.
                childTask.hasPagiNate = True
                childTask.getFrontier().addTask(childTask)
        except Exception as e:
            print("%s %s" % (
                e, "executing PaginateProcess has occurred exception"))