Example #1
    def getTotalPage(self, response):
        # requires: import urlparse  (Python 2; urllib.parse on Python 3)
        #           from scrapy.http import Request
        #           from scrapy.selector import Selector

        # The listing page is GBK-encoded, so decode the raw body before parsing.
        article_urls = Selector(text=response.body.decode("gbk")).xpath(self.xpath["article_link"]).extract()
        #article_urls = Selector(text=response.body.decode("gbk")).xpath('//td[@class="unnamed1"]/a[contains(@href,"fileview")]/@href').extract()

        if article_urls:

            for url in article_urls:

                # Article metadata (title, author, date) travels in the link's query string.
                result = urlparse.urlparse(url)
                params = urlparse.parse_qs(result.query, True)

                meta = response.meta
                meta['title'] = str(params['title'][0]) if "title" in params else ""
                meta['author'] = str(params['name'][0]) if "name" in params else ""
                if "date" in params:
                    date_str = str(params['date'][0])
                    meta['publish_time'] = '%s-%s-%s' % (date_str[0:4], date_str[4:6], date_str[6:8])

                # Two URL prefixes, depending on which section the listing page came from.
                if response.url.find("ld_history_jianghua") != -1:
                    meta['url'] = self.url_prefix1 + url
                    url = self.url_prefix1 + url.encode("gbk")
                else:
                    meta['url'] = self.url_prefix2 + url
                    url = self.url_prefix2 + url.encode("gbk")

                if not isArticleExist(url):
                    yield Request(url, callback=self.getContent, meta=meta)
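
Every snippet on this page gates crawling on isArticleExist, which is defined elsewhere in the project. A minimal sketch of what such a dedup helper could look like, assuming an in-memory store (the real implementation presumably checks the crawl database; markArticleStored is a made-up companion name):

    _seen_urls = set()

    def isArticleExist(url):
        # True if this article URL has already been stored.
        return url in _seen_urls

    def markArticleStored(url):
        # Hypothetical companion: call after an article is successfully saved.
        _seen_urls.add(url)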
Example #2
    def getTotalPage(self, response):

        meta = response.meta
        #meta['dont_redirect'] = True

        article_urls = response.xpath(self.xpath['article_link']).extract()
        #article_urls = response.xpath('//ul[@class="bottom_ul" or @class="news_ul"]/li/a/@href').extract()

        if article_urls:
            for url in article_urls:
                if url.startswith("../"):
                    # Slice off the "../" prefix; str.strip("../") would treat
                    # the argument as a character set and eat too much.
                    url = self.url_prefix + url[3:]

                if not url.startswith("http://"):
                    # Still relative: join it onto the listing page's directory.
                    index = response.url.rfind("/")
                    url = response.url[0:index + 1] + url

                meta['url'] = url
                if not isArticleExist(url):
                    yield Request(url, callback=self.getContent, meta=meta)

        # Follow the ">>" link, if any, re-entering this method on the next page.
        next_page_urls = response.xpath(self.xpath['next_page_link']).extract()
        #next_page_urls = response.xpath('//div[@id="displaypagenum"]//a[contains(text(),">>")]/@href').extract()
        if next_page_urls:
            index = response.url.rfind("/")
            next_page_url = response.url[0:index + 1] + next_page_urls[0]
            yield Request(next_page_url, callback=self.getTotalPage, meta=meta)
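
The manual rfind("/") joining above works, but the standard library can resolve "../" and "./" segments directly. A sketch using urlparse.urljoin (Python 2; urllib.parse.urljoin on Python 3):

    import urlparse

    def resolve(base_url, href):
        # Resolve a link relative to the page it was found on, e.g.
        # resolve("http://example.com/news/list.html", "../a/1.html")
        #   -> "http://example.com/a/1.html"
        return urlparse.urljoin(base_url, href)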
Example #3
    def getArticleUrl(self, response):

        meta = response.meta

        if response.url.find("gkml") != -1:

            article_urls = response.xpath(
                self.xpath["article_link_GKML"]).extract()
            #article_urls = response.xpath('//div[@id="documentContainer"]/div[@class="row"]/li[@class="mc"]//a/@href').extract()

            if article_urls:
                for url in article_urls:
                    if url.startswith("../../"):
                        # Slice off the "../../" prefix; str.lstrip("../../")
                        # would strip any run of '.' and '/' characters instead.
                        url = self.url_prefix + url[6:]

                    if not url.startswith("http://"):
                        continue

                    meta['url'] = url
                    if not isArticleExist(url):
                        yield Request(url,
                                      callback=self.getContentGKML,
                                      meta=meta)
        else:
            article_urls = response.xpath(
                self.xpath["article_link_SYrlzyhshbzb"]).extract()
            #article_urls = response.xpath('//div[@class="serviceMainListConType"]/div/div[@class="serviceMainListTxt"]/span/a/@href').extract()

            if article_urls:
                for url in article_urls:

                    index = response.url.rfind("/")
                    if url.startswith("./"):
                        # Drop the leading "." so the kept "/" joins the URL
                        # onto the page's directory.
                        url = response.url[0:index] + url[1:]

                    if not url.startswith("http://"):
                        continue

                    meta['url'] = url
                    if not isArticleExist(url):
                        yield Request(url,
                                      callback=self.getContentSYrlzyhshbzb,
                                      meta=meta)
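
Note that str.strip and str.lstrip treat their argument as a set of characters, not a literal prefix, which is why these snippets pair startswith() with slicing. A quick illustration:

    # lstrip removes any leading run of the given characters:
    print "../../zcfg/page.html".lstrip("../")   # "zcfg/page.html"  (works by luck)
    print "./.hidden/page.html".lstrip("./")     # "hidden/page.html" (leading dot lost)

    # startswith() plus slicing removes exactly the intended prefix:
    url = "../../zcfg/page.html"
    if url.startswith("../../"):
        url = url[6:]                            # "zcfg/page.html"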
Example #4
    def getArticleUrl(self, response):

        meta = response.meta
        article_urls = response.xpath(self.xpath['article_link']).extract()
        #article_urls = response.xpath('//table[@width="610"]//tr/td/a/@href').extract()

        if article_urls:
            for url in article_urls:
                # Drop a leading "./" before joining; the concatenation assumes
                # response.url ends with "/" (str.strip("./") would also eat
                # trailing '.' and '/' characters).
                if url.startswith("./"):
                    url = url[2:]
                url = response.url + url

                meta['url'] = url
                if not isArticleExist(url):
                    yield Request(url, callback=self.getContent, meta=meta)
Example #5
    def getArticleUrl(self, response):

        meta = response.meta
        article_urls = response.xpath(self.xpath["article_link"]).extract()
        #article_urls = response.xpath('(//div[@class="newsList"])//div[@class="title"]/a/@href').extract()

        if article_urls:
            for url in article_urls:
                # Prepend the configured prefix to build the absolute article URL.
                url = self.url_prefix + url

                meta['url'] = url
                if not isArticleExist(url):
                    yield Request(url, callback=self.getContent, meta=meta)
Example #6
    def getArticleUrl(self, response):

        article_urls = response.xpath(self.xpath["article_link"]).extract()
        #article_urls = response.xpath('//ul[@id="ContentPlaceHolder1_MainMiddleControl1_WebPageDocumentsByUId1"]/li/div[@class="m_sub"]/a/@href').extract()

        if article_urls:
            for url in article_urls:

                # Only absolute http:// links are followed; anything else is skipped.
                if not url.startswith("http://"):
                    continue

                if not isArticleExist(url):
                    yield Request(url,
                                  callback=self.getContent,
                                  meta=response.meta)
Example #7
    def getArticlesID(self, response):
        meta = response.meta

        # This listing exposes article IDs rather than URLs.
        article_ids = response.xpath(self.xpath["article_id"]).extract()
        #article_ids = response.xpath('//d/r1/id/text()').extract()

        for article_id in article_ids:

            article_url = self.url_prefix + article_id

            if not isArticleExist(article_url):

                # Fetch the article body through the GetTextArt endpoint.
                url = "http://ipub.exuezhe.com/Qk/GetTextArt?id=%s&pn=1&ps=100" % article_id

                meta["url"] = article_url
                meta["article_id"] = article_id

                yield Request(url, callback=self.getContent, meta=meta)
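
The GetTextArt URL above is assembled with % formatting, which is fine for plain IDs; urllib.urlencode (urllib.parse.urlencode on Python 3) is the safer general route when parameter values may need escaping. A sketch (build_text_art_url is a made-up helper name; id/pn/ps mirror the query parameters in the example):

    import urllib

    def build_text_art_url(article_id, pn=1, ps=100):
        # urlencode escapes each value and joins the pairs with "&".
        query = urllib.urlencode({"id": article_id, "pn": pn, "ps": ps})
        return "http://ipub.exuezhe.com/Qk/GetTextArt?" + query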
Example #8
    def getArticleUrl(self, response):

        meta = response.meta

        article_urls = response.xpath(self.xpath["article_link"]).extract()
        #article_urls = response.xpath('//div[@class="f-main-leftMain-content clear"]//ol/li/a/@href').extract()

        if article_urls:
            for url in article_urls:
                if url.startswith("./"):
                    # Slice off the "./" prefix; str.lstrip("./") would strip
                    # any leading run of '.' and '/' characters.
                    url = response.meta['url_prefix'] + url[2:]

                if not url.startswith("http://"):
                    continue

                meta['url'] = url
                if not isArticleExist(url):
                    yield Request(url,
                                  callback=self.getContent,
                                  meta=meta,
                                  dont_filter=True)
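
The dont_filter=True flag tells Scrapy to bypass its built-in duplicate-request filter for these requests, so the URL is fetched even if an identical request was already scheduled; deduplication is left to isArticleExist instead.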
Example #9
    def getArticleUrl(self, response):

        meta = response.meta

        # If the request was redirected to a different listing page, hand that
        # page back to getTotalPage for normal processing.
        current_page_no = self.getPageNo(response.url)
        if current_page_no != meta['page_no']:
            yield Request(response.url,
                          callback=self.getTotalPage,
                          meta=response.meta)

        article_urls = response.xpath(self.xpath['article_link']).extract()
        #article_urls = response.xpath('//div[@id="Content1"]/div[@class="xin"]/ul/li/span/a/@href').extract()

        if article_urls:
            for url in article_urls:
                url = self.url_prefix + url

                meta['url'] = url
                if not isArticleExist(url):
                    yield Request(url, callback=self.getContent, meta=meta)
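
getPageNo is not shown in these snippets; the spider compares its result against meta['page_no'] to detect that the site redirected the request to a different listing page. A hypothetical sketch, assuming listing URLs carry the page number as index_<n>.html:

    import re

    def getPageNo(self, url):
        # Hypothetical helper: ".../index_3.html" -> 3; a bare ".../index.html"
        # (or anything unmatched) is treated as page 1.
        match = re.search(r"index_(\d+)\.html", url)
        return int(match.group(1)) if match else 1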