Example #1
    def parse(self, response):

        # Raw titles, bylines, and preview snippets from the Naver news list page
        titles_raw = response.xpath(
            '//*[@id="main_content"]/div[2]/ul/li/dl/dt/a/text()').extract()
        writers = response.css('.writing::text').extract()
        previews = response.css('.lede::text').extract()

        # Keep only non-empty titles, with surrounding whitespace stripped
        titles = []
        for title in titles_raw:
            if title.strip():
                titles.append(title.strip())

        # Build one item per article, then bucket it into the module-level list for its page
        for idx in range(len(titles)):
            item = MyscraperItem()
            item['title'] = titles[idx]
            item['writer'] = writers[idx]
            item['preview'] = previews[idx]
            if response.url == "https://news.naver.com/main/list.nhn?mode=LS2D&sid2=263&sid1=101&mid=shm&page=1":
                list1.append(item)
            elif response.url == "https://news.naver.com/main/list.nhn?mode=LS2D&sid2=263&sid1=101&mid=shm&page=2":
                list2.append(item)
            elif response.url == "https://news.naver.com/main/list.nhn?mode=LS2D&sid2=263&sid1=101&mid=shm&page=3":
                list3.append(item)
            elif response.url == "https://news.naver.com/main/list.nhn?mode=LS2D&sid2=263&sid1=101&mid=shm&page=4":
                list4.append(item)
            elif response.url == "https://news.naver.com/main/list.nhn?mode=LS2D&sid2=263&sid1=101&mid=shm&page=5":
                list5.append(item)

        # Flatten the per-page buckets; return only once all five pages (100 articles) are collected
        items = []
        for page_items in [list1, list2, list3, list4, list5]:
            items.extend(page_items)
        if len(items) == 100:
            return items
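Example #1 leans on names defined outside the snippet: the MyscraperItem item class and the module-level buckets list1 through list5. A minimal sketch of what that surrounding module might look like; the field names are taken from the examples on this page, everything else is an assumption:

    import scrapy

    class MyscraperItem(scrapy.Item):
        # Fields referenced across the examples on this page (assumed definition)
        title = scrapy.Field()
        writer = scrapy.Field()
        author = scrapy.Field()
        preview = scrapy.Field()
        link = scrapy.Field()
        url_from = scrapy.Field()
        url_to = scrapy.Field()

    # Per-page buckets that Example #1 appends to (assumed to live at module level)
    list1, list2, list3, list4, list5 = [], [], [], [], []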
Example #2
    def parse_items(self, response):
        # The list of items that are found on the particular page
        items = []
        # Only extract canonicalized and unique links (with respect to the current page)
        links = LinkExtractor(canonicalize=True,
                              unique=True).extract_links(response)
        # Now go through all the found links
        for link in links:
            # Check whether the domain of the URL of the link is allowed; so whether it is in one of the allowed domains
            is_allowed = False
            for allowed_domain in self.allowed_domains:
                if allowed_domain in link.url:
                    is_allowed = True
            # If it is allowed, create a new item and add it to the list of found items
            if is_allowed:
                item = MyscraperItem()
                item['link'] = link.url
                items.append(item)
                patterns = [
                    "kalerkantho.com/online/national/",
                    "kalerkantho.com/online/Politics/",
                    "kalerkantho.com/online/Court/",
                    "kalerkantho.com/online/world/",
                    "kalerkantho.com/online/business/",
                    "kalerkantho.com/online/sahitya/",
                    "kalerkantho.com/online/sport/",
                    "kalerkantho.com/online/entertainment/",
                    "kalerkantho.com/online/info-tech/",
                    "kalerkantho.com/online/prescription/"
                ]

                file = None
                if patterns[0] in link.url:
                    file = open('../../data/national.csv', 'a')
                if patterns[1] in link.url:
                    file = open('../../data/politics.csv', 'a')
                if patterns[2] in link.url:
                    file = open('../../data/court.csv', 'a')
                if patterns[3] in link.url:
                    file = open('../../data/world.csv', 'a')
                if patterns[4] in link.url:
                    file = open('../../data/business.csv', 'a')
                if patterns[5] in link.url:
                    file = open('../../data/literature.csv', 'a')
                if patterns[6] in link.url:
                    file = open('../../data/sports.csv', 'a')
                if patterns[7] in link.url:
                    file = open('../../data/entertainment.csv', 'a')
                if patterns[8] in link.url:
                    file = open('../../data/tech.csv', 'a')
                if patterns[9] in link.url:
                    file = open('../../data/medical.csv', 'a')

                if file is not None:
                    file.write(urlShortener(link.url) + "\n")
                    file.close()

        # Return all the found items
        return items
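Examples #2 and #4 also depend on two names that are not shown: Scrapy's LinkExtractor and a project-specific urlShortener helper. The import below is the standard Scrapy one; the urlShortener body is only a guess at what such a helper might do, since the real implementation is not included here:

    from urllib.parse import urlparse

    from scrapy.linkextractors import LinkExtractor

    def urlShortener(url):
        # Hypothetical stand-in for the project's helper: drop the query string
        # and fragment, keeping only scheme, host, and path
        parsed = urlparse(url)
        return f"{parsed.scheme}://{parsed.netloc}{parsed.path}"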
Example #3
    def parse(self, response):
        # dt[2] is a fixed index in this page layout, so the [2] is intentionally kept
        titles = response.xpath('//*[@id="main_content"]/div[2]/ul/li/dl/dt[2]/a/text()').extract()
        authors = response.css('.writing::text').extract()
        previews = response.css('.lede::text').extract()

        items = []
        for idx in range(len(titles)):
            item = MyscraperItem()
            item['title'] = titles[idx]
            item['author'] = authors[idx]
            item['preview'] = previews[idx]
            items.append(item)
        return items
Example #4
    def parse_items(self, response):
        # The list of items that are found on the particular page
        items = []
        # Only extract canonicalized and unique links (with respect to the current page)
        links = LinkExtractor(canonicalize=True,
                              unique=True).extract_links(response)
        # Now go through all the found links
        for link in links:
            # Check whether the domain of the URL of the link is allowed; so whether it is in one of the allowed domains
            is_allowed = False
            for allowed_domain in self.allowed_domains:
                if allowed_domain in link.url:
                    is_allowed = True
            # If it is allowed, create a new item and add it to the list of found items
            if is_allowed:
                item = MyscraperItem()
                item['link'] = link.url
                items.append(item)
                patterns = [
                    "banglatribune.com/sport/news/",
                    "banglatribune.com/business/news/",
                    "banglatribune.com/entertainment/news/",
                    "banglatribune.com/country/news/",
                    "banglatribune.com/foreign/news/",
                    "banglatribune.com/tech-and-gadget/news/",
                    "banglatribune.com/literature/news/"
                ]

                file = None
                if patterns[0] in link.url:
                    file = open('../../data/sports.csv', 'a')
                if patterns[1] in link.url:
                    file = open('../../data/economy.csv', 'a')
                if patterns[2] in link.url:
                    file = open('../../data/entertainment.csv', 'a')
                if patterns[3] in link.url:
                    file = open('../../data/bangladesh.csv', 'a')
                if patterns[4] in link.url:
                    file = open('../../data/international.csv', 'a')
                if patterns[5] in link.url:
                    file = open('../../data/technology.csv', 'a')
                if patterns[6] in link.url:
                    file = open('../../data/literature.csv', 'a')

                if file is not None:
                    file.write(urlShortener(link.url) + "\n")
                    file.close()

        # Return all the found items
        return items
Example #5
    def parse(self, response):
        titles = response.xpath('//*[@id="main_content"]/div[2]/ul/li/dl/dt[2]/a/text()').extract()
        writers = response.css('.writing::text').extract()
        previews = response.css('.lede::text').extract()

        #zip(titles, writers, previews)
        items = []
        # Store the data extracted via XPath and CSS selectors into items
        for idx in range(len(titles)):
            item = MyscraperItem()
            item['title'] = titles[idx]
            item['writer'] = writers[idx]
            item['preview'] = previews[idx]

            items.append(item)

        return items
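The commented-out zip(...) call in Example #5 hints at a tidier way to pair the three lists. A sketch of that variant, assuming the same MyscraperItem fields; zip() stops at the shortest list, which also avoids the IndexError that indexing by range(len(titles)) can raise when the lists differ in length:

    def parse(self, response):
        titles = response.xpath('//*[@id="main_content"]/div[2]/ul/li/dl/dt[2]/a/text()').extract()
        writers = response.css('.writing::text').extract()
        previews = response.css('.lede::text').extract()

        # Pair the columns positionally instead of indexing three parallel lists
        items = []
        for title, writer, preview in zip(titles, writers, previews):
            item = MyscraperItem()
            item['title'] = title
            item['writer'] = writer
            item['preview'] = preview
            items.append(item)
        return items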
Example #6
    def parse_items(self, response):
        # The list of items that are found on the particular page
        items = []
        # Only extract canonicalized and unique links (with respect to the current page)
        links = LinkExtractor(canonicalize=True,
                              unique=True).extract_links(response)
        # Now go through all the found links
        for link in links:
            # Check whether the domain of the URL of the link is allowed; so whether it is in one of the allowed domains
            is_allowed = False
            for allowed_domain in self.allowed_domains:
                if allowed_domain in link.url:
                    is_allowed = True
            # If it is allowed, create a new item and add it to the list of found items
            if is_allowed:
                item = MyscraperItem()
                item['url_from'] = response.url
                item['url_to'] = link.url
                items.append(item)
        # Return all the found items
        return items
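Examples #2, #4, and #6 name their callback parse_items rather than parse, which is the usual signature for a CrawlSpider whose rules schedule follow-up requests. A minimal sketch of how such a spider could be wired up; the spider name, domain, and start URL are placeholders, not values from the source:

    from scrapy.spiders import CrawlSpider, Rule
    from scrapy.linkextractors import LinkExtractor

    class ExampleLinkSpider(CrawlSpider):
        name = 'example_links'                      # placeholder name
        allowed_domains = ['example.com']           # placeholder domain
        start_urls = ['https://example.com/']       # placeholder start page

        # Follow every extracted link and hand each response to parse_items()
        rules = (
            Rule(LinkExtractor(canonicalize=True, unique=True),
                 callback='parse_items', follow=True),
        )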