Example #1
    def parse(self, response):

        # Raw titles, bylines, and preview snippets from the Naver news list page
        titles_raw = response.xpath(
            '//*[@id="main_content"]/div[2]/ul/li/dl/dt/a/text()').extract()
        writers = response.css('.writing::text').extract()
        previews = response.css('.lede::text').extract()

        # Keep only non-empty titles, with surrounding whitespace stripped
        titles = []
        for title in titles_raw:
            if title.strip():
                titles.append(title.strip())

        # Build one item per article, then bucket it into the module-level list for its page
        for idx in range(len(titles)):
            item = MyscraperItem()
            item['title'] = titles[idx]
            item['writer'] = writers[idx]
            item['preview'] = previews[idx]
            if response.url == "https://news.naver.com/main/list.nhn?mode=LS2D&sid2=263&sid1=101&mid=shm&page=1":
                list1.append(item)
            elif response.url == "https://news.naver.com/main/list.nhn?mode=LS2D&sid2=263&sid1=101&mid=shm&page=2":
                list2.append(item)
            elif response.url == "https://news.naver.com/main/list.nhn?mode=LS2D&sid2=263&sid1=101&mid=shm&page=3":
                list3.append(item)
            elif response.url == "https://news.naver.com/main/list.nhn?mode=LS2D&sid2=263&sid1=101&mid=shm&page=4":
                list4.append(item)
            elif response.url == "https://news.naver.com/main/list.nhn?mode=LS2D&sid2=263&sid1=101&mid=shm&page=5":
                list5.append(item)

        # Flatten the per-page buckets; return only once all five pages (100 articles) are collected
        items = []
        for page_items in [list1, list2, list3, list4, list5]:
            items.extend(page_items)
        if len(items) == 100:
            return items
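Example #1 leans on names defined outside the snippet: the MyscraperItem item class and the module-level buckets list1 through list5. A minimal sketch of what that surrounding module might look like; the field names are taken from the examples on this page, everything else is an assumption:

    import scrapy

    class MyscraperItem(scrapy.Item):
        # Fields referenced across the examples on this page (assumed definition)
        title = scrapy.Field()
        writer = scrapy.Field()
        author = scrapy.Field()
        preview = scrapy.Field()
        link = scrapy.Field()
        url_from = scrapy.Field()
        url_to = scrapy.Field()

    # Per-page buckets that Example #1 appends to (assumed to live at module level)
    list1, list2, list3, list4, list5 = [], [], [], [], []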
Example #2
    def parse_items(self, response):
        # The list of items that are found on the particular page
        items = []
        # Only extract canonicalized and unique links (with respect to the current page)
        links = LinkExtractor(canonicalize=True,
                              unique=True).extract_links(response)
        # Now go through all the found links
        for link in links:
            # Check whether the domain of the URL of the link is allowed; so whether it is in one of the allowed domains
            is_allowed = False
            for allowed_domain in self.allowed_domains:
                if allowed_domain in link.url:
                    is_allowed = True
            # If it is allowed, create a new item and add it to the list of found items
            if is_allowed:
                item = MyscraperItem()
                item['link'] = link.url
                items.append(item)
                patterns = [
                    "kalerkantho.com/online/national/",
                    "kalerkantho.com/online/Politics/",
                    "kalerkantho.com/online/Court/",
                    "kalerkantho.com/online/world/",
                    "kalerkantho.com/online/business/",
                    "kalerkantho.com/online/sahitya/",
                    "kalerkantho.com/online/sport/",
                    "kalerkantho.com/online/entertainment/",
                    "kalerkantho.com/online/info-tech/",
                    "kalerkantho.com/online/prescription/"
                ]

                file = None
                if patterns[0] in link.url:
                    file = open('../../data/national.csv', 'a')
                if patterns[1] in link.url:
                    file = open('../../data/politics.csv', 'a')
                if patterns[2] in link.url:
                    file = open('../../data/court.csv', 'a')
                if patterns[3] in link.url:
                    file = open('../../data/world.csv', 'a')
                if patterns[4] in link.url:
                    file = open('../../data/business.csv', 'a')
                if patterns[5] in link.url:
                    file = open('../../data/literature.csv', 'a')
                if patterns[6] in link.url:
                    file = open('../../data/sports.csv', 'a')
                if patterns[7] in link.url:
                    file = open('../../data/entertainment.csv', 'a')
                if patterns[8] in link.url:
                    file = open('../../data/tech.csv', 'a')
                if patterns[9] in link.url:
                    file = open('../../data/medical.csv', 'a')

                if file is not None:
                    file.write(urlShortener(link.url) + "\n")
                    file.close()

        # Return all the found items
        return items
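Examples #2 and #4 also depend on two names that are not shown: Scrapy's LinkExtractor and a project-specific urlShortener helper. The import below is the standard Scrapy one; the urlShortener body is only a guess at what such a helper might do, since the real implementation is not included here:

    from urllib.parse import urlparse

    from scrapy.linkextractors import LinkExtractor

    def urlShortener(url):
        # Hypothetical stand-in for the project's helper: drop the query string
        # and fragment, keeping only scheme, host, and path
        parsed = urlparse(url)
        return f"{parsed.scheme}://{parsed.netloc}{parsed.path}"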
Example #3
    def parse(self, response):
        # dt[2] is a fixed index in this page layout, so the [2] is intentionally kept
        titles = response.xpath('//*[@id="main_content"]/div[2]/ul/li/dl/dt[2]/a/text()').extract()
        authors = response.css('.writing::text').extract()
        previews = response.css('.lede::text').extract()

        items = []
        for idx in range(len(titles)):
            item = MyscraperItem()
            item['title'] = titles[idx]
            item['author'] = authors[idx]
            item['preview'] = previews[idx]
            items.append(item)
        return items
Example #4
    def parse_items(self, response):
        # The list of items that are found on the particular page
        items = []
        # Only extract canonicalized and unique links (with respect to the current page)
        links = LinkExtractor(canonicalize=True,
                              unique=True).extract_links(response)
        # Now go through all the found links
        for link in links:
            # Check whether the domain of the URL of the link is allowed; so whether it is in one of the allowed domains
            is_allowed = False
            for allowed_domain in self.allowed_domains:
                if allowed_domain in link.url:
                    is_allowed = True
            # If it is allowed, create a new item and add it to the list of found items
            if is_allowed:
                item = MyscraperItem()
                item['link'] = link.url
                items.append(item)
                patterns = [
                    "banglatribune.com/sport/news/",
                    "banglatribune.com/business/news/",
                    "banglatribune.com/entertainment/news/",
                    "banglatribune.com/country/news/",
                    "banglatribune.com/foreign/news/",
                    "banglatribune.com/tech-and-gadget/news/",
                    "banglatribune.com/literature/news/"
                ]

                file = None
                if patterns[0] in link.url:
                    file = open('../../data/sports.csv', 'a')
                if patterns[1] in link.url:
                    file = open('../../data/economy.csv', 'a')
                if patterns[2] in link.url:
                    file = open('../../data/entertainment.csv', 'a')
                if patterns[3] in link.url:
                    file = open('../../data/bangladesh.csv', 'a')
                if patterns[4] in link.url:
                    file = open('../../data/international.csv', 'a')
                if patterns[5] in link.url:
                    file = open('../../data/technology.csv', 'a')
                if patterns[6] in link.url:
                    file = open('../../data/literature.csv', 'a')

                if file is not None:
                    file.write(urlShortener(link.url) + "\n")
                    file.close()

        # Return all the found items
        return items
Example #5
    def parse(self, response):
        titles = response.xpath('//*[@id="main_content"]/div[2]/ul/li/dl/dt[2]/a/text()').extract()
        writers = response.css('.writing::text').extract()
        previews = response.css('.lede::text').extract()

        #zip(titles, writers, previews)
        items = []
        # Store the data extracted via XPath and CSS selectors into items
        for idx in range(len(titles)):
            item = MyscraperItem()
            item['title'] = titles[idx]
            item['writer'] = writers[idx]
            item['preview'] = previews[idx]

            items.append(item)

        return items
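The commented-out zip(...) call in Example #5 hints at a tidier way to pair the three lists. A sketch of that variant, assuming the same MyscraperItem fields; zip() stops at the shortest list, which also avoids the IndexError that indexing by range(len(titles)) can raise when the lists differ in length:

    def parse(self, response):
        titles = response.xpath('//*[@id="main_content"]/div[2]/ul/li/dl/dt[2]/a/text()').extract()
        writers = response.css('.writing::text').extract()
        previews = response.css('.lede::text').extract()

        # Pair the columns positionally instead of indexing three parallel lists
        items = []
        for title, writer, preview in zip(titles, writers, previews):
            item = MyscraperItem()
            item['title'] = title
            item['writer'] = writer
            item['preview'] = preview
            items.append(item)
        return items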
Example #6
    def parse_items(self, response):
        # The list of items that are found on the particular page
        items = []
        # Only extract canonicalized and unique links (with respect to the current page)
        links = LinkExtractor(canonicalize=True,
                              unique=True).extract_links(response)
        # Now go through all the found links
        for link in links:
            # Check whether the domain of the URL of the link is allowed; so whether it is in one of the allowed domains
            is_allowed = False
            for allowed_domain in self.allowed_domains:
                if allowed_domain in link.url:
                    is_allowed = True
            # If it is allowed, create a new item and add it to the list of found items
            if is_allowed:
                item = MyscraperItem()
                item['url_from'] = response.url
                item['url_to'] = link.url
                items.append(item)
        # Return all the found items
        return items
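Examples #2, #4, and #6 name their callback parse_items rather than parse, which is the usual signature for a CrawlSpider whose rules schedule follow-up requests. A minimal sketch of how such a spider could be wired up; the spider name, domain, and start URL are placeholders, not values from the source:

    from scrapy.spiders import CrawlSpider, Rule
    from scrapy.linkextractors import LinkExtractor

    class ExampleLinkSpider(CrawlSpider):
        name = 'example_links'                      # placeholder name
        allowed_domains = ['example.com']           # placeholder domain
        start_urls = ['https://example.com/']       # placeholder start page

        # Follow every extracted link and hand each response to parse_items()
        rules = (
            Rule(LinkExtractor(canonicalize=True, unique=True),
                 callback='parse_items', follow=True),
        )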