Example #1
    def store_response(self, spider, request, response):
        # assemble the document and persist it in the configured MongoDB collection
        data = {
            'status': response.status,
            'domain': get_domain(response.url),
            'url': response.url,
            'headers': self._clean_headers(response.headers),
            'html': response.body,
        }
        self.db[self.collection_name].insert_one(data)
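
Every example on this page calls a get_domain(url) helper that is not shown. A minimal sketch of what such a helper might look like, assuming it simply returns the host part of the URL (an assumption based on usage, not the project's actual implementation):

from urllib.parse import urlparse


def get_domain(url):
    # assumption: return the network location (host) of the URL,
    # e.g. get_domain("https://example.com/a/b?q=1") -> "example.com"
    return urlparse(url).netloc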
Example #2
    def run(self):
        data = {}

        meta_tags_data = MetaTagExtractor(
            response=self.response,
            extractor=self.extractor,
            extractor_id=self.extractor_id).run().get(self.extractor_id, {})

        paragraphs_data = ParagraphsExtractor(
            response=self.response,
            extractor=self.extractor,
            extractor_id="paragraphs").run().get("paragraphs", {})
        # TODO - clean the extracted data, e.g. meta_tags_data.get("title")
        extracted_data = {
            "title": meta_tags_data.get("title")
                     or meta_tags_data.get("meta__title")
                     or meta_tags_data.get("og__title")
                     or meta_tags_data.get("fb__title")
                     or meta_tags_data.get("meta__twitter__title"),
            "description": meta_tags_data.get("description")
                           or meta_tags_data.get("meta__description")
                           or meta_tags_data.get("og__description")
                           or meta_tags_data.get("fb__description")
                           or meta_tags_data.get("meta__twitter__description"),
            "image": meta_tags_data.get("image")
                     or meta_tags_data.get("meta__image")
                     or meta_tags_data.get("og__image")
                     or meta_tags_data.get("fb__image")
                     or meta_tags_data.get("meta__twitter__image"),
            "url": meta_tags_data.get("url")
                   or meta_tags_data.get("meta__url")
                   or meta_tags_data.get("og__url")
                   or meta_tags_data.get("fb__url")
                   or meta_tags_data.get("meta__twitter__url"),
            "page_type": meta_tags_data.get("og__type"),
            "keywords": meta_tags_data.get("meta__keywords"),
            "domain": get_domain(self.response.url),
            "first_paragraph": paragraphs_data[0] if len(paragraphs_data) > 0 else None,
            "shortlink_url": self.response.xpath(
                '//link[@rel="shortlink"]/@href').extract_first(),
            "canonical_url": self.response.xpath(
                '//link[@rel="canonical"]/@href').extract_first(),
        }
        data[self.extractor_id] = extracted_data
        return data
Example #3
    def store_response(self, spider, request, response):
        data = {
            'status': response.status,
            'domain': get_domain(response.url),
            'url': response.url,
            # decode the raw body instead of stripping the repr() of a bytes object
            'html': response.body.decode('utf-8', errors='ignore')
                    .replace("\n", "").replace("\t", ""),
            'created': datetime.now()
        }
        data.update(
            self._flatten_headers(self._clean_headers(response.headers)))
        self.WebLink(meta={'id': get_urn(response.url)}, **data).save()
Example #4
    def parse_node(self, response, node):
        title = self.get_or_none(node.select('title/text()'))

        url = self.get_or_none(node.select('link/text()'))
        description = self.get_or_none(node.select('description/text()'))
        pub_date = self.get_or_none(node.select('pubDate/text()'))
        category = self.get_or_none(node.select('category/text()'))
        # image = node.select('item/media:content/url')

        item = {
            'title': title,
            'url': url,
            'pub_date': pub_date,
            'category': category,
            'description': description,
            'domain': get_domain(response.url),
        }
        return item
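
The get_or_none helper used above is not shown in the snippet. A minimal sketch, assuming it just unwraps the first value of a selector (the name comes from the calls above, the body is an assumption):

    def get_or_none(self, selector):
        # assumption: return the first extracted value, or None when the selector matched nothing
        return selector.extract_first() if selector else None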
Example #5
    def parse(self, response):
        # parse downloaded content with feedparser (NOT re-downloading with feedparser)
        feed = self.parse_feed(response.body)
        if feed:
            # grab some feed elements
            # - https://pythonhosted.org/feedparser/common-rss-elements.html
            # - https://pythonhosted.org/feedparser/common-atom-elements.html

            # ns = feed.namespaces
            # feed_title = feed.feed.title
            # feed_link = feed.feed.link
            # feed_desc = feed.feed.description

            for entry in feed.entries:
                # have content?
                content = entry.get('content')
                if content:
                    # content = content[0]
                    content = content[0]['value']

                item = {
                    # global feed data
                    # 'feed_title': feed_title,
                    # 'feed_link': feed_link,
                    # 'feed_description': feed_desc,
                    #
                    # item entry data
                    # 'url': response.url,
                    'url': entry.link,
                    'title': entry.title,
                    'domain': get_domain(response.url),
                    'description': entry.description,
                    # 'date': entry.published,
                    # 'date': entry.published_parsed,
                    'pub_date': entry.updated_parsed,

                    # optional
                    'content': content,
                    'type': entry.get('dc_type'),
                }

                yield item
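
self.parse_feed(response.body) is not defined in this snippet. A minimal sketch using the feedparser library, written here as a standalone function; the bozo check and the None return are assumptions:

import feedparser


def parse_feed(body):
    # feedparser.parse accepts a bytes/str document and returns a FeedParserDict
    feed = feedparser.parse(body)
    # bozo is set when the document could not be parsed cleanly
    if feed.bozo and not feed.entries:
        return None
    return feed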
Example #6
    def run(self):
        data = {}
        extracted_data = AllLinksExtractor(
            response=self.response,
            extractor=self.extractor,
            extractor_id="all_links"
        ).run().get("all_links", {})

        # group the extracted links by their domain
        links_data = {}
        for link in extracted_data:
            domain = get_domain(link)
            if domain in links_data:
                links_data[domain].append(link)
            else:
                links_data[domain] = [link]

        data[self.extractor_id] = [
            {"domain": domain, "links": domain_links, "links_count": len(domain_links)}
            for domain, domain_links in links_data.items()
        ]

        return data
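
The same grouping can also be written with collections.defaultdict. A small standalone sketch under the same assumptions (a plain list of links stands in for the extractor output, and it reuses the get_domain helper sketched earlier):

from collections import defaultdict


def group_links_by_domain(links):
    # collect each link under the domain returned by get_domain()
    links_data = defaultdict(list)
    for link in links:
        links_data[get_domain(link)].append(link)
    return [
        {"domain": domain, "links": domain_links, "links_count": len(domain_links)}
        for domain, domain_links in links_data.items()
    ]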
Example #7
File: solr.py  Project: dvlop/invana-bot
    def store_response(self, spider, request, response):
        data = {
            'status': response.status,
            'domain': get_domain(response.url),
            'url': response.url,
            # decode the raw body instead of stripping the repr() of a bytes object
            'html': response.body.decode('utf-8', errors='ignore')
                    .replace("\n", "").replace("\t", ""),
            'created': datetime.now()
        }
        data.update(
            self._flatten_headers(self._clean_headers(response.headers)))

        data = self.map_to_solr_datatypes(data=data)
        data['id'] = self.clean_str(get_urn(response.url))

        self.solr.add([data])
Example #8
    def parse(self, response=None):

        current_crawler = response.meta.get("current_crawler")
        crawlers = response.meta.get("crawlers")
        context = self.context

        if None in [crawlers, current_crawler]:
            current_crawler = self.current_crawler
            crawlers = self.crawlers

        data = {}
        for extractor in current_crawler['parsers']:
            extracted_data = self.run_extractor(response=response,
                                                extractor=extractor)
            data.update(extracted_data)

        # always create the context sub-document so crawler_id can be attached below
        data['context'] = context if context is not None else {}
        data['url'] = response.url
        data['domain'] = get_domain(response.url)
        data['context']['crawler_id'] = current_crawler['crawler_id']
        yield data

        for traversal in current_crawler.get('traversals', []):
            if traversal['traversal_type'] == "pagination":
                # TODO - move this to run_pagination_traversal(self, response=None, traversal=None) method;
                traversal_config = traversal['pagination']
                next_crawler_id = traversal['next_crawler_id']
                max_pages = traversal_config.get("max_pages", 1)
                current_page_count = response.meta.get('current_page_count', 1)
                if current_page_count < max_pages:
                    next_selector = traversal_config.get('selector')
                    if next_selector:
                        if traversal_config.get('selector_type') == 'css':
                            next_page = response.css(
                                next_selector + "::attr(href)").extract_first()
                        elif traversal_config.get('selector_type') == 'xpath':
                            # XPath has no ::attr() pseudo-element; use /@href instead
                            next_page = response.xpath(
                                next_selector + "/@href").extract_first()
                        else:
                            next_page = None
                        current_page_count = current_page_count + 1
                        if next_page:
                            if "://" not in next_page:
                                next_page_url = "https://" + get_domain(
                                    response.url) + next_page
                            else:
                                next_page_url = next_page
                            next_crawler = get_crawler_from_list(
                                crawler_id=next_crawler_id, crawlers=crawlers)
                            yield scrapy.Request(
                                next_page_url,
                                callback=self.parse,
                                meta={
                                    "current_page_count": current_page_count,
                                    "current_crawler": next_crawler,
                                    "crawlers": crawlers
                                })
            elif traversal['traversal_type'] == TRAVERSAL_LINK_FROM_FIELD:
                next_crawler_id = traversal['next_crawler_id']
                traversal_config = traversal[TRAVERSAL_LINK_FROM_FIELD]

                subdocument_key = self.get_subdocument_key(
                    crawler=current_crawler,
                    parser_id=traversal_config['parser_id'])
                for item in data.get(traversal_config['parser_id'], {}).get(
                        subdocument_key, []):
                    traversal_url = item[
                        traversal[TRAVERSAL_LINK_FROM_FIELD]['selector_id']]
                    if traversal_url:
                        if "://" not in traversal_url:  # TODO - fix this monkey patch
                            url_parsed = urlparse(response.url)
                            traversal_url = url_parsed.scheme + "://" + url_parsed.netloc + "/" + traversal_url.lstrip(
                                "/")

                        next_crawler = get_crawler_from_list(
                            crawler_id=next_crawler_id, crawlers=crawlers)
                        yield scrapy.Request(
                            traversal_url,
                            callback=self.parse,
                            meta={
                                "crawlers": crawlers,
                                "current_crawler": next_crawler,
                            })
                    else:
                        print("ignoring traversal to {}".format(traversal_url))
            elif traversal['traversal_type'] == TRAVERSAL_SAME_DOMAIN_FIELD:
                all_urls = response.css("a::attr(href)").extract()
                filtered_urls = []
                all_urls = list(set(all_urls))
                current_domain = get_domain(response.url)
                for url in all_urls:
                    url = get_absolute_url(url=url, origin_url=response.url)
                    if get_domain(url) == current_domain:
                        filtered_urls.append(url)
                filtered_urls = list(set(filtered_urls))
                # max_pages = traversal.get("max_pages", 100)
                # implementing max_pages here is difficult because it keeps adding
                # 100 new pages in each thread.
                current_page_count = response.meta.get('current_page_count', 1)
                next_crawler_id = traversal['next_crawler_id']
                next_parser = get_crawler_from_list(crawler_id=next_crawler_id,
                                                    crawlers=crawlers)

                for url in filtered_urls:
                    current_page_count = current_page_count + 1

                    yield scrapy.Request(
                        url,
                        callback=self.parse,
                        meta={
                            "current_page_count": current_page_count,
                            "current_crawler": next_parser,
                            "crawlers": crawlers
                        })
        self.post_parse(response=response)
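
Example #8 relies on a few helpers that are not shown. A minimal sketch of get_absolute_url and get_crawler_from_list, assuming the former resolves relative hrefs and the latter looks up a crawler config by id (both bodies are assumptions, not the project's code):

from urllib.parse import urljoin


def get_absolute_url(url=None, origin_url=None):
    # assumption: resolve a possibly relative href against the page it was found on
    return urljoin(origin_url, url)


def get_crawler_from_list(crawler_id=None, crawlers=None):
    # assumption: return the crawler config dict whose crawler_id matches, else None
    for crawler in crawlers or []:
        if crawler.get("crawler_id") == crawler_id:
            return crawler
    return None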
Example #9
    def parse_nodes(self, response, nodes):
        """This method is called for the nodes matching the provided tag name
        (itertag). It receives the response and a Selector for each node.
        Overriding this method is mandatory; otherwise, your spider won't work.
        This method must return either a BaseItem, a Request, or a list
        containing any of them.
        """
        spider_config = response.meta.get("spider_config")
        spiders = response.meta.get("spiders")
        context = self.context or {}
        if spider_config is None:
            spider_config = self.spider_config
            spiders = self.spiders

        data = {"url": response.url, "domain": get_domain(response.url)}
        for extractor in spider_config.get('extractors', []):
            extracted_items = []
            for selector in nodes:
                ret = iterate_spider_output(
                    self.parse_node(response, selector, extractor))
                for result_item in self.process_results(response, ret):
                    extracted_items.append(result_item)
            data[extractor['extractor_id']] = {}
            data[extractor['extractor_id']]['entries'] = extracted_items
        context["spider_id"] = spider_config.get("spider_id")
        data['context'] = context
        """
        If spider_traversal_id is None, this response originated from a request
        raised by the start urls.

        If it is not None, the request/response was raised by some traversal strategy.
        """
        current_request_traversal_id = response.meta.get(
            'current_request_traversal_id', None)
        """
        In XML crawling, current_request_traversal_page_count starts from 1 because there is no page 0.
        """
        current_request_traversal_page_count = response.meta.get(
            'current_request_traversal_page_count', 1)
        """
        Note on current_request_spider_id:
        this can never be None, including for requests started by start_urls.
        """
        spider_config_id = spider_config.get("spider_id")

        spider_traversals = spider_config.get('traversals', [])
        for traversal in spider_traversals:
            next_spider_id = traversal['next_spider_id']
            iter_param = traversal['iter_param']

            next_spider = get_spider_from_list(spider_id=next_spider_id,
                                               spiders=spiders)

            traversal['allow_domains'] = next_spider.get("allowed_domains", [])
            traversal_id = traversal['traversal_id']
            traversal_max_pages = traversal.get('max_pages', 1)

            traversal_links = []
            is_this_request_from_same_traversal = self.is_this_request_from_same_traversal(
                response, traversal)
            print("is_this_request_from_same_traversal",
                  is_this_request_from_same_traversal)
            print("current_request_traversal_page_count",
                  current_request_traversal_page_count)
            print("traversal_max_pages", traversal_max_pages)
            print(
                " current_request_traversal_page_count <= traversal_max_pages",
                current_request_traversal_page_count <= traversal_max_pages)
            shall_traverse = False

            if current_request_traversal_id is None:
                """
                Start urls will not have this traversal_id set, so we should allow them to traverse.
                """
                shall_traverse = True

            elif is_this_request_from_same_traversal and current_request_traversal_page_count <= traversal_max_pages:
                """
                This block applies to traversals within the same spider_id, i.e. pagination of a spider.
                """

                shall_traverse = True

            elif is_this_request_from_same_traversal:
                """
                """
                shall_traverse = True

            elif (is_this_request_from_same_traversal is False
                  and current_request_traversal_page_count <= traversal_max_pages):
                """
                This is for spider_a traversing to spider_b; this is not pagination but
                traversing between spiders.
                """
                shall_traverse = True
            print("shall_traverse: {}".format(traversal_id), shall_traverse)
            if shall_traverse:
                current_url = response.url
                clean_url_without_iter_param = current_url.split(
                    "?")[0] if "?" in current_url else current_url
                # the iter_param query string, if any, has already been stripped above
                print("clean_url_without_iter_param",
                      clean_url_without_iter_param)
                traversal_link = "{}?{}={}".format(
                    clean_url_without_iter_param, iter_param,
                    current_request_traversal_page_count + 1)

                print("traversal_link", traversal_link)

                data[traversal_id] = {"traversal_urls": [traversal_link]}
                """
                Then validate the max_pages logic, if this traversal_id's traversal has any.
                This is where further traversal for this traversal_id is decided.
                """
                max_pages = traversal.get("max_pages", 1)

                current_request_traversal_page_count += 1
                """
                We are already incrementing the last page number above, so using <= here
                might make it 6 pages when max_pages is 5.
                """
                if current_request_traversal_page_count <= max_pages:
                    print("=======current_request_traversal_page_count",
                          current_request_traversal_page_count)
                    print("-----------------------------------")
                    yield scrapy.Request(
                        traversal_link,
                        callback=self.parse,
                        errback=self.parse_error,
                        meta={
                            "spider_config": next_spider,
                            "spiders": spiders,
                            "current_request_traversal_id": traversal_id,
                            "current_request_traversal_page_count":
                                current_request_traversal_page_count,
                        })

            print("=================================================")
            print("====traversal_links", traversal_id, len(traversal_links),
                  traversal_links)
            print("=================================================")

        yield data

        self.post_parse(response=response)
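
The is_this_request_from_same_traversal helper is not shown above. Based on how current_request_traversal_id is stored in the request meta here, a plausible sketch (an assumption, not the project's actual code):

    def is_this_request_from_same_traversal(self, response, traversal):
        # assumption: a response belongs to the same traversal when the traversal_id
        # carried in its request meta matches the traversal currently being evaluated
        return response.meta.get("current_request_traversal_id") == traversal["traversal_id"]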