Example #1
    def parse(self, response):
        # Featured stories in the promoted region.
        for news in response.xpath('//div[@class="region region-promoted"]'):
            item = MycrawlerItem()
            item['URL'] = response.url
            item['TITLE'] = news.xpath(
                './/div[@class="views-field views-field-title text-justify"]/span/a/text()'
            ).extract_first()
            item['BODY'] = news.xpath(
                './/div[@class="views-field views-field-body text-justify  padding-right-zero"]/span/text()'
            ).extract_first()
            item['DATE'] = news.xpath(
                './/div[@class="views-field views-field-field-date-publishing"]/div/text()'
            ).extract_first()
            yield item

        # Main story list.
        for news in response.xpath('//div[@id="block-system-main"]//li'):
            item = MycrawlerItem()
            item['URL'] = response.url
            item['TITLE'] = news.xpath(
                './div[2]/span/a/text()').extract_first()
            item['BODY'] = news.xpath('./div[3]/span/text()').extract_first()
            item['DATE'] = news.xpath('./div[4]/div/text()').extract_first()
            yield item
        # Footer story list.
        for news in response.xpath(
                '//div[@id="content-footer-inside"]//ul[@class="item"]/li'):
            item = MycrawlerItem()
            item['URL'] = response.url
            item['TITLE'] = news.xpath(
                './div[2]/span/a/text()').extract_first()
            item['BODY'] = news.xpath('./div[3]/span/text()').extract_first()
            item['DATE'] = news.xpath('./div[4]/div/text()').extract_first()
            yield item
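
Every example on this page fills the same four fields of MycrawlerItem, but the item class itself never appears in the listing. A minimal sketch of the items.py these snippets appear to assume (the field names are simply the keys used above):

import scrapy


class MycrawlerItem(scrapy.Item):
    # One Field per key used by the parse callbacks in these examples.
    URL = scrapy.Field()
    TITLE = scrapy.Field()
    BODY = scrapy.Field()
    DATE = scrapy.Field()
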
Example #2
    def parse(self, response):
        all_news = response.xpath('//a[@class="title-link"]/ancestor::div[1]')

        for news in all_news:
            relative_url = news.xpath('a/@href').extract_first()
            absolute_url = response.urljoin(relative_url)
            title = news.xpath('./a//span/text()').extract_first()
            date = news.xpath(
                './/li[@class="mini-info-list__item"]/div[@data-datetime]/text()'
            ).extract_first()

            yield scrapy.Request(absolute_url,
                                 callback=self.parse_page,
                                 meta={
                                     'URL': absolute_url,
                                     'TITLE': title,
                                     'DATE': date
                                 })

        # relative_next_url = response.xpath('//a[@class="button next"]/@href').extract_first()
        # absolute_next_url = "https://newyork.craigslist.org" + relative_next_url

        next_page_url = response.xpath(
            '//li[@class="next"]/a/@href').extract_first()
        if next_page_url is not None:
            yield scrapy.Request(response.urljoin(next_page_url))
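
Example #2 forwards the values scraped from the listing page to a second callback through Request.meta. If Scrapy 1.7+ is available, the same hand-off can be written with cb_kwargs, which delivers the values as plain keyword arguments; a sketch under that assumption, reusing the XPaths from this example:

    def parse(self, response):
        for news in response.xpath('//a[@class="title-link"]/ancestor::div[1]'):
            absolute_url = response.urljoin(news.xpath('a/@href').extract_first())
            title = news.xpath('./a//span/text()').extract_first()
            yield scrapy.Request(absolute_url,
                                 callback=self.parse_page,
                                 cb_kwargs={'title': title})

    def parse_page(self, response, title):
        item = MycrawlerItem()
        item['URL'] = response.url  # no need to forward the URL; the callback has it
        item['TITLE'] = title
        yield item
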
Example #3
def parse_page(self, response):
    item = MycrawlerItem()
    item['URL'] = response.meta.get('URL')
    item['TITLE'] = response.meta.get('TITLE')
    item['DATE'] = response.meta.get('DATE')
    # Union of the places the body text can live on this page; the last
    # branch also needs /text() so every alternative extracts a string.
    item['BODY'] = response.xpath(
        '//div[@class="lts-txt2"]/text()'
        ' | //div[@class="lts-txt2"]/div/text()'
        ' | //div[@class="lts-txt2"]/p/text()'
        ' | //div[@class="lts-txt2"]/div/p/text()'
    ).extract_first()
    yield item
Example #4
def parse_sub_news(self, response):
    item = MycrawlerItem()
    item['URL'] = response.meta.get('URL')
    item['TITLE'] = response.xpath(
        '//div[@class="main-news-heading visible-xs visible-sm"]/h1/text()'
    ).extract_first()
    item['DATE'] = response.meta.get('DATE')
    item['BODY'] = response.xpath(
        '//div[@class="lts-txt2"]/text()').extract_first()
    return item
Example #5
def parse_main_news(self, response):
    item = MycrawlerItem()
    item['URL'] = response.meta.get('URL')
    item['TITLE'] = response.xpath(
        './/div[@class="main-news-heading visible-xs visible-sm"]/h1/text()'
    ).extract_first()
    item['DATE'] = response.meta.get('DATE')
    item['BODY'] = response.xpath(
        '//div[@class="text-left w-300 editor-styles"]/p/text()'
    ).extract_first()
    return item
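
Examples #1 and #5 mix relative ('.//...') and absolute ('//...') XPaths. Called directly on a response the two behave the same, but inside a per-node loop only './/' stays scoped to the current selector; '//' always searches the entire document. A runnable check with parsel, the selector library Scrapy is built on:

from parsel import Selector

sel = Selector(text='<div id="a"><span>in a</span></div>'
                    '<div id="b"><span>in b</span></div>')
for div in sel.xpath('//div'):
    print(div.xpath('.//span/text()').extract_first(),  # scoped: 'in a', then 'in b'
          div.xpath('//span/text()').extract_first())   # global: 'in a' both times
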
Example #6
    def parse(self, response):
        for news in response.xpath('//a[@class="title-link"]/ancestor::div[1]'):
            item = MycrawlerItem()
            item['URL'] = response.url
            item['TITLE'] = news.xpath('./a/h3[@class="title-link__title"]/span/text()').extract_first()
            item['BODY'] = news.xpath('./p[@class="eagle-item__summary"]/text()').extract_first()
            item['DATE'] = news.xpath('.//li[@class="mini-info-list__item"]/div[@data-datetime]/text()').extract_first()
            yield item
        # yield: each item is collected by Scrapy and written to the output file;
        # the item must already contain all of the information to be collected.
        next_page_url = response.xpath('//li[@class="next"]/a/@href').extract_first()
        if next_page_url is not None:
            yield scrapy.Request(response.urljoin(next_page_url))
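
The pagination tail above extracts the next-page href, joins it against the current URL, and yields a new request. If Scrapy 1.4+ is available, response.follow performs the urljoin step itself, so the same tail can be shortened to:

        next_page_url = response.xpath('//li[@class="next"]/a/@href').extract_first()
        if next_page_url is not None:
            yield response.follow(next_page_url, callback=self.parse)
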
Example #7
    def parse(self, response):
        for news in response.xpath('//div[@class="article-content"]'):
            item = MycrawlerItem()
            item['URL'] = response.url
            item['TITLE'] = news.xpath('./h2/a/text()').extract_first()
            item['BODY'] = news.xpath('./p/text()').extract_first()
            item['DATE'] = news.xpath('./span/a/text()').extract_first()
            yield item
        # yield: each item is collected by Scrapy and written to the output file;
        # the item must already contain all of the information to be collected.
        # next_page_url = response.xpath('//a[contains(text(),"Next Page")]/@href').extract_first()
        # if next_page_url is not None:
        #     yield scrapy.Request(response.urljoin(next_page_url))
Example #8
    def parse(self, response):
        for news in response.xpath('//div[@class="story-text"]'):
            item = MycrawlerItem()
            item['URL'] = response.url
            item['TITLE'] = news.xpath('.//h4/a/text()').extract_first()
            item['BODY'] = news.xpath('.//p/text()').extract_first()
            item['DATE'] = news.xpath('.//div/span/text()').extract_first()
            yield item
        # yield: each item is collected by Scrapy and written to the output file;
        # the item must already contain all of the information to be collected.
        # nextPages = response.xpath("//ul[@class='pagination']/li/a/@href")
        # for nextPage in nextPages:
        #     next_page_url = response.urljoin(nextPage.extract())
        #     if next_page_url is not None:
        #         yield scrapy.Request(next_page_url)
Example #9
    def parse(self, response):
        for news in response.xpath('//div[@class="col-md-6 cat-ite"]'):
            item = MycrawlerItem()
            item['URL'] = response.url
            item['TITLE'] = news.xpath('.//div[2]/a/text()').extract_first()
            item['BODY'] = news.xpath('.//div[3]/text()').extract_first()
            # item['DATE'] = news.xpath('').extract_first()
            yield item
        # yield: each item is collected by Scrapy and written to the output file;
        # the item must already contain all of the information to be collected.
        # response.urljoin() always returns a string, so no None check is needed.
        next_pages = response.xpath(
            "//div[@class='page-nation']//li/a[contains(text(),'Next')]/@href")
        for next_page in next_pages:
            yield scrapy.Request(response.urljoin(next_page.extract()))
Example #10
    def parse(self, response):
        for news in response.xpath('//article'):
            item = MycrawlerItem()
            item['URL'] = response.url
            item['TITLE'] = news.xpath('./text()').extract_first()
            item['BODY'] = news.xpath(
                './/div[@class="entry-content"]/text()').extract_first()
            item['DATE'] = news.xpath(
                './/span[@class="entry-date post-date"]/abbr/text()'
            ).extract_first()
            yield item
        # yield: each item is collected by Scrapy and written to the output file;
        # the item must already contain all of the information to be collected.
        next_pages = response.xpath(
            "//div[@class='wp-pagenavi iegradient']/a/@href")
        for next_page in next_pages:
            yield scrapy.Request(response.urljoin(next_page.extract()))
Example #11
File: ssmy.py Project: Witure/crawl
    def parse(self, response):
        # data_list = []
        div_list = response.xpath(
            "/html/body/main/div/div/div[1]/div/div[2]/div[1]")
        """
        /html/body/main/div/div/div[1]/div/div[2]/div[1]
        /html/body/main/div/div/div[1]/div/div[2]/div[1]/div[1]/div
        /html/body/main/div/div/div[1]/div/div[2]/div[1]/div[4]/div
        /html/body/main/div/div/div[1]/div/div[2]/div[1]/div[3]/div/div/div[1]/a
        """

        # print(div_list)
        for div in div_list:
            # Note: the elements of the list returned by xpath are Selector
            # objects; the string data we want is stored inside those objects
            # and can only be retrieved by calling extract() on them.
            content = div.xpath("./div/div/div/div[1]/a/text()").extract()
            item = MycrawlerItem()
            item["content"] = content
            # xpath returns a list with several elements; how do we pull out
            # the string that corresponds to each element?
            print(content)
            print(item)

            yield item
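
The translated comment in Example #11 is the point all of these snippets turn on: an XPath query returns a list of Selector objects, and extract() / extract_first() are what pull the strings out of them. Newer releases (parsel 1.x / Scrapy 1.8+) document getall() and get() as the preferred spellings of the same calls. A runnable comparison with parsel:

from parsel import Selector

sel = Selector(text='<ul><li>one</li><li>two</li></ul>')
print(sel.xpath('//li/text()').extract())        # ['one', 'two'] -- every match
print(sel.xpath('//li/text()').extract_first())  # 'one' -- first match, or None
print(sel.xpath('//li/text()').getall())         # same as extract()
print(sel.xpath('//li/text()').get())            # same as extract_first()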