Example #1
    def store_response(self, spider, request, response):
        # persist the raw crawl result as a single MongoDB document
        data = {
            'status': response.status,
            'domain': get_domain(response.url),
            'url': response.url,
            'headers': self._clean_headers(response.headers),
            'html': response.body,  # raw bytes, stored as BSON binary
        }
        self.db[self.COLLECTION_NAME].insert_one(data)
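This snippet leans on two helpers the listing does not show. A minimal sketch of what get_domain and _clean_headers might look like, assuming Scrapy's bytes-keyed Headers mapping (the names match the calls above, but the bodies are guesses, not the project's actual code):

from urllib.parse import urlparse

def get_domain(url):
    # hypothetical helper: hostname without port, e.g. 'example.com'
    return urlparse(url).netloc.split(':')[0]

    def _clean_headers(self, headers):
        # Scrapy's response.headers maps bytes keys to lists of bytes values;
        # decode everything to plain str so it can be serialized downstream
        return {key.decode('utf-8'): b', '.join(values).decode('utf-8')
                for key, values in headers.items()}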
Example #2
    def store_response(self, spider, request, response):
        # index the crawl result as an elasticsearch-dsl WebLink document
        data = {
            'status': response.status,
            'domain': get_domain(response.url),
            'url': response.url,
            # response.text is the properly decoded body; the original
            # str(response.body).lstrip("b'") chain can mangle leading characters
            'html': response.text.replace('\n', '').replace('\t', ''),
            'created': datetime.now(),
        }
        data.update(
            self._flatten_headers(self._clean_headers(response.headers)))
        WebLink(meta={'id': get_urn(response.url)}, **data).save()
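Example #2 also assumes an elasticsearch-dsl document class named WebLink plus a _flatten_headers helper, neither of which appears in the listing. A hedged sketch of how they could be defined (the field choices and the header_ prefix are assumptions):

from elasticsearch_dsl import Date, Document, Integer, Keyword, Text, connections

connections.create_connection(hosts=['localhost'])

class WebLink(Document):
    status = Integer()
    domain = Keyword()
    url = Keyword()
    html = Text()
    created = Date()

    class Index:
        name = 'weblinks'

    def _flatten_headers(self, headers):
        # hypothetical: turn {'Content-Type': ...} into {'header_content_type': ...}
        # so each header becomes its own flat document field
        return {'header_{}'.format(key.lower().replace('-', '_')): value
                for key, value in headers.items()}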
Example #3
    def parse_node(self, response, node):
        # each <item> from the RSS feed arrives here as a Selector;
        # .xpath() is the modern replacement for the pre-1.0 Scrapy .select() API
        return {
            'title': self.get_or_none(node.xpath('title/text()')),
            'url': self.get_or_none(node.xpath('link/text()')),
            'pub_date': self.get_or_none(node.xpath('pubDate/text()')),
            'category': self.get_or_none(node.xpath('category/text()')),
            'description': self.get_or_none(node.xpath('description/text()')),
            'domain': get_domain(response.url),
            # image = node.xpath('item/media:content/url')
        }
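get_or_none is another helper the listing omits; with parsel's SelectorList it can be a one-liner, since .get() already returns None when nothing matches. A sketch, assuming a modern Scrapy/parsel stack:

    @staticmethod
    def get_or_none(selector):
        # SelectorList.get() returns the first extracted string, or None if empty
        return selector.get()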
Example #4
    def parse(self, response):
        # parse the already-downloaded body with feedparser (no second download)
        feed = self.parse_feed(response.body)
        if feed:
            # grab some feed elements
            # - https://pythonhosted.org/feedparser/common-rss-elements.html
            # - https://pythonhosted.org/feedparser/common-atom-elements.html

            # ns = feed.namespaces
            # feed_title = feed.feed.title
            # feed_link = feed.feed.link
            # feed_desc = feed.feed.description

            for entry in feed.entries:
                # some feeds embed the full content; take the first value if present
                content = entry.get('content')
                if content:
                    content = content[0]['value']

                item = {
                    # global feed data
                    # 'feed_title': feed_title,
                    # 'feed_link': feed_link,
                    # 'feed_description': feed_desc,
                    #
                    # item entry data
                    # 'url': response.url,
                    'url': entry.link,
                    'title': entry.title,
                    'domain': get_domain(response.url),
                    'description': entry.description,
                    # 'date': entry.published,
                    # 'date': entry.published_parsed,
                    'pub_date': entry.updated_parsed,

                    # optional
                    'content': content,
                    'type': entry.get('dc_type'),
                }

                yield item
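parse_feed is also not shown; the comment at the top of parse() implies it hands the already-downloaded body straight to feedparser. A minimal sketch under that assumption:

import feedparser

    def parse_feed(self, body):
        # feedparser.parse() accepts the raw document itself (bytes or str),
        # so this parses the body Scrapy already fetched; no second request
        feed = feedparser.parse(body)
        if feed.bozo and not feed.entries:
            return None  # malformed feed with nothing usable
        return feed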
Example #5
File: solr.py  Project: kirschd/web-crawler
    def store_response(self, spider, request, response):
        # index the crawl result into Solr
        data = {
            'status': response.status,
            'domain': get_domain(response.url),
            'url': response.url,
            # decoded body, with the fragile str(bytes) strip/replace chain
            # replaced by response.text
            'html': response.text.replace('\n', '').replace('\t', ''),
            'created': datetime.now(),
        }
        data.update(
            self._flatten_headers(self._clean_headers(response.headers)))

        data = self.map_to_solr_datatypes(data=data)
        data['id'] = get_urn(response.url)  # stable document id for updates/dedup
        self.solr.add([data])
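Finally, get_urn and map_to_solr_datatypes round out the helpers used across these examples. Hedged sketches (the URN scheme and the type mapping are guesses; Solr does expect ISO-8601 UTC date strings):

import hashlib
from datetime import datetime

def get_urn(url):
    # hypothetical: derive a stable, URL-safe document id from the URL
    return 'urn:sha1:{}'.format(hashlib.sha1(url.encode('utf-8')).hexdigest())

    def map_to_solr_datatypes(self, data):
        # Solr date fields expect ISO-8601 UTC strings like '2020-01-01T12:00:00Z'
        return {key: (value.strftime('%Y-%m-%dT%H:%M:%SZ')
                      if isinstance(value, datetime) else value)
                for key, value in data.items()}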