def parse(self, response):
    self._logger.debug("crawled url {}".format(response.request.url))

    cur_depth = 0
    if 'curdepth' in response.meta:
        cur_depth = response.meta['curdepth']

    # capture raw response
    item = RawResponseItem()
    # populated from response.meta
    item['appid'] = response.meta['appid']
    item['crawlid'] = response.meta['crawlid']
    item['attrs'] = response.meta['attrs']
    # populated from raw HTTP response
    item["url"] = response.request.url
    item["response_url"] = response.url
    item["status_code"] = response.status
    item["status_msg"] = "OK"
    item["response_headers"] = self.reconstruct_headers(response)
    item["request_headers"] = response.request.headers
    item["body"] = response.body
    item["links"] = []

    # determine whether to continue spidering
    if cur_depth >= response.meta['maxdepth']:
        self._logger.debug("Not spidering links in '{}' because"
                           " cur_depth={} >= maxdepth={}".format(
                               response.url, cur_depth,
                               response.meta['maxdepth']))
    else:
        # we are spidering -- yield Request for each discovered link
        link_extractor = LinkExtractor(
            allow_domains=response.meta['allowed_domains'],
            allow=response.meta['allow_regex'],
            deny=response.meta['deny_regex'],
            deny_extensions=response.meta['deny_extensions'])

        for link in link_extractor.extract_links(response):
            # link that was discovered
            the_url = link.url
            the_url = the_url.replace('\n', '')
            item["links"].append({"url": the_url, "text": link.text})

            req = Request(the_url, callback=self.parse)
            req.meta['priority'] = response.meta['priority'] - 10
            req.meta['curdepth'] = response.meta['curdepth'] + 1

            if 'useragent' in response.meta and \
                    response.meta['useragent'] is not None:
                req.headers['User-Agent'] = response.meta['useragent']

            self._logger.debug("Trying to follow link '{}'".format(req.url))
            yield req

    # raw response has been processed, yield to item pipeline
    yield item
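The parse() method above reads several keys out of response.meta (appid, crawlid, attrs, maxdepth, priority, the link-extractor filters, and optionally curdepth and useragent); these are normally injected upstream by whatever seeds the crawl. As a rough, hypothetical illustration of the fields it expects, a manually seeded request might look like the sketch below; all names and values here are placeholder assumptions, not part of the original spider.

def start_requests(self):
    # Hypothetical sketch only: shows the meta fields parse() expects.
    yield Request(
        url='http://example.com',         # assumed seed URL
        callback=self.parse,
        meta={
            'appid': 'demo-app',          # job identifiers (placeholders)
            'crawlid': 'demo-crawl-01',
            'attrs': None,
            'curdepth': 0,                # required once links are followed
            'maxdepth': 2,                # stop spidering past this depth
            'priority': 100,              # decremented by 10 per hop
            'allowed_domains': None,      # LinkExtractor filters
            'allow_regex': None,
            'deny_regex': None,
            'deny_extensions': None,
            'useragent': None,            # optional User-Agent override
        })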
def parse(self, response):
    # Check url at start of parse to catch links that were potentially redirected.
    orig_domain = response.url
    if "orig_domain" in response.meta:
        orig_domain = response.meta["orig_domain"]
    else:
        response.meta["orig_domain"] = orig_domain

    if not self.validate_link(response.url, orig_domain):
        return

    self._logger.debug("starting parse on url {}".format(response.request.url))

    cur_depth = 0
    if 'curdepth' in response.meta:
        cur_depth = response.meta['curdepth']
    else:
        response.meta['curdepth'] = cur_depth

    self._logger.debug("Forming response object")
    # capture raw response
    item = RawResponseItem()
    # populated from response.meta
    item['appid'] = response.meta['appid']
    item['crawlid'] = response.meta['crawlid']
    item['attrs'] = response.meta['attrs']
    # populated from raw HTTP response
    item["url"] = response.request.url
    item["response_url"] = response.url
    item["status_code"] = response.status
    item["status_msg"] = "OK"
    item["response_headers"] = self.reconstruct_headers(response)
    item["request_headers"] = response.request.headers
    item["links"] = []
    item["curdepth"] = str(cur_depth)

    is_pdf = False
    url = response.url.lower()
    if url.endswith('.pdf') or '.pdf?' in url:
        is_pdf = True
    item["is_pdf"] = str(is_pdf)

    if is_pdf:
        self._logger.debug("Handling pdf file")
        self.download_file(response.url)
        item["body"] = self.pdfparser("temp_document.pdf")
    else:
        item["body"] = self.gather_text(response.body)

    self._logger.debug("Current depth: " + str(cur_depth))

    # determine whether to continue spidering
    if cur_depth >= response.meta['maxdepth']:
        self._logger.debug("Not spidering links in '{}' because"
                           " cur_depth={} >= maxdepth={}".format(
                               response.url, cur_depth,
                               response.meta['maxdepth']))
    else:
        # we are spidering -- yield Request for each discovered link
        link_extractor = LinkExtractor(
            allow_domains=response.meta['allowed_domains'],
            allow=response.meta['allow_regex'],
            deny=response.meta['deny_regex'],
            deny_extensions=response.meta['deny_extensions'])

        for link in link_extractor.extract_links(response):
            # link that was discovered
            the_url = link.url
            the_url = the_url.replace('\n', '')

            if not self.validate_link(the_url, orig_domain):
                continue

            item["links"].append(str({"url": the_url, "text": link.text}))

            req = Request(the_url, callback=self.parse)
            req.meta['priority'] = response.meta['priority'] - 10
            req.meta['curdepth'] = response.meta['curdepth'] + 1

            if 'useragent' in response.meta and \
                    response.meta['useragent'] is not None:
                req.headers['User-Agent'] = response.meta['useragent']

            self._logger.debug("Trying to follow link '{}'".format(req.url))
            yield req

    # raw response has been processed, yield to item pipeline
    yield item
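This variant depends on several helpers that are not shown here (validate_link, download_file, pdfparser, gather_text). As one example, a minimal sketch of what validate_link might look like is given below, assuming its only job is to keep the crawl on the host of the original (possibly redirected) seed URL; the real helper may well do more.

from urllib.parse import urlparse

def validate_link(self, url, orig_domain):
    # Hypothetical sketch: restrict following to the original host.
    # orig_domain is a full URL (response.url), so parse out its host first.
    link_host = urlparse(url).netloc.lower()
    orig_host = urlparse(orig_domain).netloc.lower()
    return link_host == orig_host or link_host.endswith('.' + orig_host)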
def parse(self, response):
    # debug output for receiving the url
    self._logger.debug("crawled url {}".format(response.request.url))

    # step counter for how many pages we have hit
    step = 0
    if 'step' in response.meta:
        step = response.meta['step']

    # Create Item to send to kafka
    # capture raw response
    item = RawResponseItem()
    # populated from response.meta
    item['appid'] = response.meta['appid']
    item['crawlid'] = response.meta['crawlid']
    item['attrs'] = response.meta['attrs']
    # populated from raw HTTP response
    item["url"] = response.request.url
    item["response_url"] = response.url
    item["status_code"] = response.status
    item["status_msg"] = "OK"
    item["response_headers"] = self.reconstruct_headers(response)
    item["request_headers"] = response.request.headers
    item["body"] = response.body
    item["links"] = []

    # we want to know how far our spider gets
    if item['attrs'] is None:
        item['attrs'] = {}
    item['attrs']['step'] = step

    self._logger.debug("Finished creating item")

    # determine what link we want to crawl
    link_extractor = LinkExtractor(
        allow_domains=response.meta['allowed_domains'],
        allow=response.meta['allow_regex'],
        deny=response.meta['deny_regex'],
        deny_extensions=response.meta['deny_extensions'])

    links = link_extractor.extract_links(response)

    # there are links on the page
    if len(links) > 0:
        self._logger.debug("Attempting to find links")
        link = random.choice(links)
        req = Request(link.url, callback=self.parse)

        # increment our step counter for this crawl job
        req.meta['step'] = step + 1

        # pass along our user agent as well
        if 'useragent' in response.meta and \
                response.meta['useragent'] is not None:
            req.headers['User-Agent'] = response.meta['useragent']

        # debug output
        self._logger.debug("Trying to yield link '{}'".format(req.url))

        # yield the Request to the scheduler
        yield req
    else:
        self._logger.info("Did not find any more links")

    # raw response has been processed, yield to item pipeline
    yield item
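All three parse() methods call self.reconstruct_headers(response), which is defined elsewhere. A minimal sketch of what such a helper could do, assuming its purpose is simply to turn Scrapy's byte-string headers into plain strings so the item can be serialized downstream, is:

def reconstruct_headers(self, response):
    # Hypothetical sketch: Scrapy stores header names and values as bytes,
    # with possibly repeated values per header; decode them into plain
    # strings so the item can be serialized (e.g. to JSON) downstream.
    headers = {}
    for key in response.headers.keys():
        values = response.headers.getlist(key)
        headers[key.decode('utf-8')] = [v.decode('utf-8') for v in values]
    return headers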