def store_response(self, spider, request, response):
    data = {
        'status': response.status,
        'domain': get_domain(response.url),
        'url': response.url,
        'headers': self._clean_headers(response.headers),
        'html': response.body,
    }
    self.db[self.collection_name].insert_one(data)
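# The helpers used above are not shown in this snippet. Below is a minimal
# sketch of what they might look like, assuming get_domain() returns the
# netloc of a URL and _clean_headers() converts Scrapy's bytes-based headers
# into plain strings so the document can be serialized into MongoDB. Names and
# implementations here are assumptions, shown as plain functions for brevity.
from urllib.parse import urlparse


def get_domain(url):
    # e.g. "https://example.com/a/b" -> "example.com" (assumed behaviour)
    return urlparse(url).netloc


def _clean_headers(headers):
    # Scrapy's response.headers maps bytes keys to lists of bytes values;
    # decode both so the result is JSON/BSON friendly.
    return {
        key.decode("utf-8"): [value.decode("utf-8") for value in values]
        for key, values in headers.items()
    }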
def run(self):
    data = {}
    meta_tags_data = MetaTagExtractor(
        response=self.response,
        extractor=self.extractor,
        extractor_id=self.extractor_id).run().get(self.extractor_id, {})
    # paragraphs are a list, so default to [] rather than {}
    paragraphs_data = ParagraphsExtractor(
        response=self.response,
        extractor=self.extractor,
        extractor_id="paragraphs").run().get("paragraphs", [])

    # TODO - clean the extracted data, e.g. meta_tags_data.get("title")
    extracted_data = {
        "title": meta_tags_data.get("title")
                 or meta_tags_data.get("meta__title")
                 or meta_tags_data.get("og__title")
                 or meta_tags_data.get("fb__title")
                 or meta_tags_data.get("meta__twitter__title"),
        "description": meta_tags_data.get("description")
                       or meta_tags_data.get("meta__description")
                       or meta_tags_data.get("og__description")
                       or meta_tags_data.get("fb__description")
                       or meta_tags_data.get("meta__twitter__description"),
        "image": meta_tags_data.get("image")
                 or meta_tags_data.get("meta__image")
                 or meta_tags_data.get("og__image")
                 or meta_tags_data.get("fb__image")
                 or meta_tags_data.get("meta__twitter__image"),
        "url": meta_tags_data.get("url")
               or meta_tags_data.get("meta__url")
               or meta_tags_data.get("og__url")
               or meta_tags_data.get("fb__url")
               or meta_tags_data.get("meta__twitter__url"),
        "page_type": meta_tags_data.get("og__type"),
        "keywords": meta_tags_data.get("meta__keywords"),
        "domain": get_domain(self.response.url),
        "first_paragraph": paragraphs_data[0] if len(paragraphs_data) > 0 else None,
        "shortlink_url": self.response.xpath(
            '//link[@rel="shortlink"]/@href').extract_first(),
        "canonical_url": self.response.xpath(
            '//link[@rel="canonical"]/@href').extract_first(),
    }
    data[self.extractor_id] = extracted_data
    return data
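# MetaTagExtractor is assumed to flatten <meta> tags into keys such as
# "og__title", "meta__keywords" or "meta__twitter__description". A rough
# sketch of that mapping with plain Scrapy selectors; the key-naming
# convention is inferred from the lookups above, not the extractor's real code.
def extract_meta_tags(response):
    meta_tags = {}
    for tag in response.xpath("//meta"):
        prop = tag.xpath("@property").extract_first()
        name = tag.xpath("@name").extract_first()
        content = tag.xpath("@content").extract_first()
        if not content:
            continue
        if prop:
            # e.g. "og:title" -> "og__title" (assumed)
            meta_tags[prop.replace(":", "__")] = content
        elif name:
            # e.g. "keywords" -> "meta__keywords",
            #      "twitter:title" -> "meta__twitter__title" (assumed)
            meta_tags["meta__" + name.replace(":", "__")] = content
    return meta_tags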
def store_response(self, spider, request, response):
    data = {
        'status': response.status,
        'domain': get_domain(response.url),
        'url': response.url,
        # decode the body properly instead of stripping the str(bytes) repr,
        # then drop newlines and tabs as before
        'html': response.text.replace("\n", "").replace("\t", ""),
        'created': datetime.now()
    }
    data.update(
        self._flatten_headers(self._clean_headers(response.headers)))
    self.WebLink(meta={'id': get_urn(response.url)}, **data).save()
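# _flatten_headers() is assumed to turn the cleaned headers dict into
# top-level document fields so they can live alongside 'url', 'status', etc.
# in the Elasticsearch document. A minimal sketch under that assumption; the
# field-naming convention is hypothetical.
def _flatten_headers(headers):
    flattened = {}
    for key, values in headers.items():
        # "Content-Type" -> "headers__content_type" (assumed naming)
        field = "headers__" + key.lower().replace("-", "_")
        flattened[field] = values[0] if isinstance(values, list) else values
    return flattened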
def parse_node(self, response, node):
    title = self.get_or_none(node.select('title/text()'))
    url = self.get_or_none(node.select('link/text()'))
    description = self.get_or_none(node.select('description/text()'))
    pub_date = self.get_or_none(node.select('pubDate/text()'))
    category = self.get_or_none(node.select('category/text()'))
    # image = node.select('item/media:content/url')

    item = {
        'title': title,
        'url': url,
        'pub_date': pub_date,
        'category': category,
        'description': description,
        'domain': get_domain(response.url),
    }
    return item
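# get_or_none() is assumed to return the first extracted value of a selector,
# or None when the node is missing. A sketch of that helper (hypothetical
# implementation; the project's actual version may differ).
def get_or_none(selector):
    values = selector.extract() if selector else []
    return values[0] if values else None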
def parse(self, response):
    # parse the downloaded content with feedparser (NOT re-downloading with feedparser)
    feed = self.parse_feed(response.body)
    if feed:
        # grab some feed elements
        # - https://pythonhosted.org/feedparser/common-rss-elements.html
        # - https://pythonhosted.org/feedparser/common-atom-elements.html
        # ns = feed.namespaces
        # feed_title = feed.feed.title
        # feed_link = feed.feed.link
        # feed_desc = feed.feed.description

        for entry in feed.entries:
            # does the entry carry full content?
            content = entry.get('content')
            if content:
                # feedparser returns a list of content dicts; take the first value
                content = content[0]['value']

            item = {
                # global feed data
                # 'feed_title': feed_title,
                # 'feed_link': feed_link,
                # 'feed_description': feed_desc,
                #
                # item entry data
                # 'url': response.url,
                'url': entry.link,
                'title': entry.title,
                'domain': get_domain(response.url),
                'description': entry.description,
                # 'date': entry.published,
                # 'date': entry.published_parsed,
                'pub_date': entry.updated_parsed,
                # optional
                'content': content,
                'type': entry.get('dc_type'),
            }
            yield item
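# parse_feed() is assumed to be a thin wrapper around feedparser.parse() that
# returns None for unparseable bodies. A minimal sketch under that assumption.
import feedparser


def parse_feed(body):
    feed = feedparser.parse(body)
    # feedparser sets the "bozo" flag when the feed is malformed
    if feed.bozo and not feed.entries:
        return None
    return feed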
def run(self):
    data = {}
    extracted_data = AllLinksExtractor(
        response=self.response,
        extractor=self.extractor,
        extractor_id="all_links").run().get("all_links", [])

    # group the extracted links by their domain
    links_data = {}
    for link in extracted_data:
        domain = get_domain(link)
        links_data.setdefault(domain, []).append(link)

    data[self.extractor_id] = [
        {"domain": domain, "links": domain_links, "links_count": len(domain_links)}
        for domain, domain_links in links_data.items()
    ]
    return data
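# AllLinksExtractor is assumed to return the href of every anchor on the page,
# resolved against the response URL. A rough sketch of that extraction
# (hypothetical, shown only to make the grouping above concrete).
def extract_all_links(response):
    links = []
    for href in response.css("a::attr(href)").extract():
        links.append(response.urljoin(href))
    return list(set(links))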
def store_response(self, spider, request, response):
    data = {
        'status': response.status,
        'domain': get_domain(response.url),
        'url': response.url,
        # decode the body properly instead of stripping the str(bytes) repr,
        # then drop newlines and tabs as before
        'html': response.text.replace("\n", "").replace("\t", ""),
        'created': datetime.now()
    }
    data.update(
        self._flatten_headers(self._clean_headers(response.headers)))
    data = self.map_to_solr_datatypes(data=data)
    data['id'] = self.clean_str(get_urn(response.url))
    self.solr.add([data])
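# map_to_solr_datatypes() is assumed to rename fields so they match Solr's
# dynamic-field suffixes (e.g. "*_s" for strings, "*_i" for ints, "*_dt" for
# dates). This is a hypothetical sketch of that idea; the actual suffix scheme
# and schema are assumptions.
from datetime import datetime


def map_to_solr_datatypes(data):
    mapped = {}
    for key, value in data.items():
        if isinstance(value, bool):
            mapped[key + "_b"] = value
        elif isinstance(value, int):
            mapped[key + "_i"] = value
        elif isinstance(value, datetime):
            mapped[key + "_dt"] = value.isoformat() + "Z"
        else:
            mapped[key + "_s"] = str(value)
    return mapped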
def parse(self, response=None):
    current_crawler = response.meta.get("current_crawler")
    crawlers = response.meta.get("crawlers")
    context = self.context
    if None in [crawlers, current_crawler]:
        current_crawler = self.current_crawler
        crawlers = self.crawlers

    data = {}
    for extractor in current_crawler['parsers']:
        extracted_data = self.run_extractor(response=response,
                                            extractor=extractor)
        data.update(extracted_data)
    if context is not None:
        data.update({"context": context})
    data['url'] = response.url
    data['domain'] = get_domain(response.url)
    # make sure 'context' exists before tagging the crawler_id onto it
    data.setdefault('context', {})
    data['context']['crawler_id'] = current_crawler['crawler_id']
    yield data

    for traversal in current_crawler.get('traversals', []):
        if traversal['traversal_type'] == "pagination":
            # TODO - move this to a run_pagination_traversal(self, response=None, traversal=None) method
            traversal_config = traversal['pagination']
            next_crawler_id = traversal['next_crawler_id']
            max_pages = traversal_config.get("max_pages", 1)
            current_page_count = response.meta.get('current_page_count', 1)
            if current_page_count < max_pages:
                next_selector = traversal_config.get('selector')
                if next_selector:
                    if traversal_config.get('selector_type') == 'css':
                        next_page = response.css(
                            next_selector + "::attr(href)").extract_first()
                    elif traversal_config.get('selector_type') == 'xpath':
                        # XPath uses /@href, not the CSS ::attr(href) syntax
                        next_page = response.xpath(
                            next_selector + "/@href").extract_first()
                    else:
                        next_page = None
                    current_page_count = current_page_count + 1
                    if next_page:
                        if "://" not in next_page:
                            next_page_url = "https://" + get_domain(
                                response.url) + next_page
                        else:
                            next_page_url = next_page
                        next_crawler = get_crawler_from_list(
                            crawler_id=next_crawler_id, crawlers=crawlers)
                        yield scrapy.Request(
                            next_page_url,
                            callback=self.parse,
                            meta={
                                "current_page_count": current_page_count,
                                "current_crawler": next_crawler,
                                "crawlers": crawlers
                            })
        elif traversal['traversal_type'] == TRAVERSAL_LINK_FROM_FIELD:
            next_crawler_id = traversal['next_crawler_id']
            traversal_config = traversal[TRAVERSAL_LINK_FROM_FIELD]
            subdocument_key = self.get_subdocument_key(
                crawler=current_crawler,
                parser_id=traversal_config['parser_id'])
            for item in data.get(traversal_config['parser_id'], {}).get(
                    subdocument_key, []):
                traversal_url = item[traversal_config['selector_id']]
                if traversal_url:
                    if "://" not in traversal_url:
                        # TODO - fix this monkey patch
                        url_parsed = urlparse(response.url)
                        traversal_url = (url_parsed.scheme + "://" +
                                         url_parsed.netloc + "/" +
                                         traversal_url.lstrip("/"))
                    next_crawler = get_crawler_from_list(
                        crawler_id=next_crawler_id, crawlers=crawlers)
                    yield scrapy.Request(
                        traversal_url,
                        callback=self.parse,
                        meta={
                            "crawlers": crawlers,
                            "current_crawler": next_crawler,
                        })
                else:
                    print("ignoring traversal to {}".format(traversal_url))
        elif traversal['traversal_type'] == TRAVERSAL_SAME_DOMAIN_FIELD:
            # collect every same-domain link on the page and traverse them
            all_urls = list(set(response.css("a::attr(href)").extract()))
            current_domain = get_domain(response.url)
            filtered_urls = []
            for url in all_urls:
                url = get_absolute_url(url=url, origin_url=response.url)
                if get_domain(url) == current_domain:
                    filtered_urls.append(url)
            filtered_urls = list(set(filtered_urls))
            # max_pages = traversal.get("max_pages", 100)
            # implementing max_pages is difficult because each response keeps
            # adding another batch of pages to the queue.
            current_page_count = response.meta.get('current_page_count', 1)
            next_crawler_id = traversal['next_crawler_id']
            next_parser = get_crawler_from_list(crawler_id=next_crawler_id,
                                                crawlers=crawlers)
            for url in filtered_urls:
                current_page_count = current_page_count + 1
                yield scrapy.Request(
                    url,
                    callback=self.parse,
                    meta={
                        "current_page_count": current_page_count,
                        "current_crawler": next_parser,
                        "crawlers": crawlers
                    })
    self.post_parse(response=response)
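# get_crawler_from_list() is assumed to pick the crawler config whose
# crawler_id matches. A minimal sketch under that assumption (hypothetical
# implementation).
def get_crawler_from_list(crawler_id=None, crawlers=None):
    for crawler in crawlers or []:
        if crawler.get("crawler_id") == crawler_id:
            return crawler
    return None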
def parse_nodes(self, response, nodes):
    """This method is called for the nodes matching the provided tag name
    (itertag). Receives the response and a Selector for each node.
    Overriding this method is mandatory; otherwise, your spider won't work.
    This method must return either a BaseItem, a Request, or a list
    containing any of them.
    """
    spider_config = response.meta.get("spider_config")
    spiders = response.meta.get("spiders")
    context = self.context or {}
    if spider_config is None:
        spider_config = self.spider_config
        spiders = self.spiders

    data = {"url": response.url, "domain": get_domain(response.url)}
    for extractor in spider_config.get('extractors', []):
        extracted_items = []
        for selector in nodes:
            ret = iterate_spider_output(
                self.parse_node(response, selector, extractor))
            for result_item in self.process_results(response, ret):
                extracted_items.append(result_item)
        data[extractor['extractor_id']] = {'entries': extracted_items}

    context["spider_id"] = spider_config.get("spider_id")
    data['context'] = context

    # If current_request_traversal_id is None, this response originated from a
    # request raised by the start urls. If it is not None, the request/response
    # was raised by some traversal strategy.
    current_request_traversal_id = response.meta.get(
        'current_request_traversal_id', None)
    # In xml crawling, current_request_traversal_page_count starts from 1,
    # because there is no page 0.
    current_request_traversal_page_count = response.meta.get(
        'current_request_traversal_page_count', 1)
    # Note on current_request_spider_id: this can never be None, including for
    # the requests started by start_urls.
    spider_config_id = spider_config.get("spider_id")

    spider_traversals = spider_config.get('traversals', [])
    for traversal in spider_traversals:
        next_spider_id = traversal['next_spider_id']
        iter_param = traversal['iter_param']
        next_spider = get_spider_from_list(spider_id=next_spider_id,
                                           spiders=spiders)
        traversal['allow_domains'] = next_spider.get("allowed_domains", [])
        traversal_id = traversal['traversal_id']
        traversal_max_pages = traversal.get('max_pages', 1)
        traversal_links = []

        is_this_request_from_same_traversal = self.is_this_request_from_same_traversal(
            response, traversal)
        print("is_this_request_from_same_traversal",
              is_this_request_from_same_traversal)
        print("current_request_traversal_page_count",
              current_request_traversal_page_count)
        print("traversal_max_pages", traversal_max_pages)
        print("current_request_traversal_page_count <= traversal_max_pages",
              current_request_traversal_page_count <= traversal_max_pages)

        shall_traverse = False
        if current_request_traversal_id is None:
            # start urls will not have a traversal_id set, so allow them to traverse
            shall_traverse = True
        elif is_this_request_from_same_traversal and \
                current_request_traversal_page_count <= traversal_max_pages:
            # valid for traversals from the same spider_id, i.e. pagination of a spider
            shall_traverse = True
        elif is_this_request_from_same_traversal:
            shall_traverse = True
        elif is_this_request_from_same_traversal is False and \
                current_request_traversal_page_count <= traversal_max_pages:
            # this is spider_a traversing to spider_b; not pagination,
            # but traversing between spiders
            shall_traverse = True
        print("shall_traverse: {}".format(traversal_id), shall_traverse)

        if shall_traverse:
            current_url = response.url
            # drop any existing query string; this url is already being iterated
            clean_url_without_iter_param = current_url.split(
                "?")[0] if "?" in current_url else current_url
            print("clean_url_without_iter_param",
                  clean_url_without_iter_param)
            traversal_link = "{}?{}={}".format(
                clean_url_without_iter_param, iter_param,
                current_request_traversal_page_count + 1)
            print("traversal_link", traversal_link)
            data[traversal_id] = {"traversal_urls": [traversal_link]}

            # Then validate the max_pages logic, if this traversal_id has any.
            # This is where further traversal for this traversal_id is decided.
            max_pages = traversal.get("max_pages", 1)
            current_request_traversal_page_count += 1
            # we are already incrementing the count above, so using <= might
            # make it 6 pages when max_pages is 5
            if current_request_traversal_page_count <= max_pages:
                print("=======current_request_traversal_page_count",
                      current_request_traversal_page_count)
                print("-----------------------------------")
                yield scrapy.Request(
                    traversal_link,
                    callback=self.parse,
                    errback=self.parse_error,
                    meta={
                        "spider_config": next_spider,
                        "spiders": spiders,
                        "current_request_traversal_id": traversal_id,
                        "current_request_traversal_page_count":
                            current_request_traversal_page_count,
                    })
        print("=================================================")
        print("====traversal_links", traversal_id, len(traversal_links),
              traversal_links)
        print("=================================================")

    yield data
    self.post_parse(response=response)
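# is_this_request_from_same_traversal() is assumed to check whether the
# response came from a request that carried this traversal's id in its meta.
# A hypothetical sketch of that check; the real implementation may differ.
def is_this_request_from_same_traversal(response, traversal):
    return response.meta.get("current_request_traversal_id") == \
        traversal.get("traversal_id")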