def parse(self, response): self._logger.debug("crawled url {}".format(response.request.url)) cur_depth = 0 if 'curdepth' in response.meta: cur_depth = response.meta['curdepth'] # capture raw response item = RawResponseItem() # populated from response.meta item['appid'] = response.meta['appid'] item['crawlid'] = response.meta['crawlid'] item['attrs'] = response.meta['attrs'] # populated from raw HTTP response item["url"] = response.request.url item["response_url"] = response.url item["status_code"] = response.status item["status_msg"] = "OK" item["response_headers"] = self.reconstruct_headers(response) item["request_headers"] = response.request.headers # item["body"] = response.body item["body"] = "asdfsdfsdfsdfsdfsdf" item["links"] = [] hxs = Selector(response) cityList = hxs.xpath('//div[@class="header-city-province-text"]/a') for city in cityList: link = city.xpath('@href').extract()[0] cityName = city.xpath("text()").extract()[0] yield Request(url=link, meta={"city": cityName}, callback=self.parse_url, dont_filter=True)
def parse(self, response): self._logger.debug("crawled url {}".format(response.request.url)) cur_depth = 0 if 'curdepth' in response.meta: cur_depth = response.meta['curdepth'] # capture raw response item = RawResponseItem() # populated from response.meta item['appid'] = response.meta['appid'] item['crawlid'] = response.meta['crawlid'] item['attrs'] = response.meta['attrs'] # populated from raw HTTP response item["url"] = response.request.url item["response_url"] = response.url item["status_code"] = response.status item["status_msg"] = "OK" item["response_headers"] = self.reconstruct_headers(response) item["request_headers"] = response.request.headers item["body"] = response.body item["links"] = [] # determine whether to continue spidering if cur_depth >= response.meta['maxdepth']: self._logger.debug("Not spidering links in '{}' because" \ " cur_depth={} >= maxdepth={}".format( response.url, cur_depth, response.meta['maxdepth'])) else: # we are spidering -- yield Request for each discovered link link_extractor = LinkExtractor( allow_domains=response.meta['allowed_domains'], allow=response.meta['allow_regex'], deny=response.meta['deny_regex'], deny_extensions=response.meta['deny_extensions']) for link in link_extractor.extract_links(response): # link that was discovered the_url = link.url the_url = the_url.replace('\n', '') item["links"].append({"url": the_url, "text": link.text, }) req = Request(the_url, callback=self.parse) req.meta['priority'] = response.meta['priority'] - 10 req.meta['curdepth'] = response.meta['curdepth'] + 1 if 'useragent' in response.meta and \ response.meta['useragent'] is not None: req.headers['User-Agent'] = response.meta['useragent'] self._logger.debug("Trying to follow link '{}'".format(req.url)) yield req # raw response has been processed, yield to item pipeline yield item
def _get_item(self):
    item = RawResponseItem()
    item['appid'] = 'app'
    item['crawlid'] = 'crawlid'
    item['attrs'] = {}
    item["url"] = "http://dumb.com"
    item["response_url"] = "http://dumb.com"
    item["status_code"] = 200
    item["status_msg"] = "OK"
    item["response_headers"] = {}
    item["request_headers"] = {}
    item["body"] = "text"
    item["links"] = []
    return item
def _get_internationalized_utf8_item(self):
    item = RawResponseItem()
    item['appid'] = 'app'
    item['crawlid'] = 'crawlid'
    item['attrs'] = {}
    item["url"] = "http://dumb.com"
    item["response_url"] = "http://dumb.com"
    item["status_code"] = 200
    item["status_msg"] = "OK"
    item["response_headers"] = {}
    item["request_headers"] = {}
    item["body"] = u"This is a test - Αυτό είναι ένα τεστ - 这是一个测试 - これはテストです"
    item["links"] = []
    item["encoding"] = "utf-8"
    return item
def _get_internationalized_iso_item(self):
    item = RawResponseItem()
    item['appid'] = 'app'
    item['crawlid'] = 'crawlid'
    item['attrs'] = {}
    item["url"] = "http://dumb.com"
    item["response_url"] = "http://dumb.com"
    item["status_code"] = 200
    item["status_msg"] = "OK"
    item["response_headers"] = {}
    item["request_headers"] = {}
    # item["body"] holds the string 'αυτό είναι ένα τεστ' encoded in iso-8859-7
    # (via iconv) and then base64-encoded so it can be stored inside this file.
    item["body"] = base64.b64decode('4fX0/CDl3+3h6SDd7eEg9OXz9Ao=')
    item["links"] = []
    item["encoding"] = "iso-8859-7"
    return item
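As a quick sanity check on the fixture above, decoding the base64 payload and then applying the iso-8859-7 encoding recorded in item["encoding"] recovers the Greek test string:

import base64

raw = base64.b64decode('4fX0/CDl3+3h6SDd7eEg9OXz9Ao=')
# Decodes to u'αυτό είναι ένα τεστ' plus a trailing newline.
print(raw.decode('iso-8859-7'))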
def parse(self, response): self._logger.debug("crawled url {}".format(response.request.url)) # capture raw response item = RawResponseItem() # populated from response.meta item['appid'] = response.meta['appid'] item['crawlid'] = response.meta['crawlid'] item['attrs'] = response.meta['attrs'] # populated from raw HTTP response item["url"] = response.request.url item["response_url"] = response.url item["status_code"] = response.status item["response_headers"] = self.reconstruct_headers(response) item["request_headers"] = response.request.headers # raw response has been processed, yield to item pipeline self._logger.debug("Created Item successfully") yield item
def parse(self, response): self._logger.debug("crawled url {}".format(response.request.url)) cur_depth = 0 if 'curdepth' in response.meta: cur_depth = response.meta['curdepth'] # capture raw response item = RawResponseItem() # populated from response.meta item['appid'] = response.meta['appid'] item['crawlid'] = response.meta['crawlid'] item['attrs'] = response.meta['attrs'] # populated from raw HTTP response item["url"] = response.request.url item["response_url"] = response.url item["status_code"] = response.status item["status_msg"] = "OK" item["response_headers"] = self.reconstruct_headers(response) item["request_headers"] = response.request.headers item["body"] = response.body # item["body"] = "asdfsdfsdfsdfsdfsdf" item["links"] = [] soup = BeautifulSoup(response.body.decode("utf-8"), "lxml") citys = soup.find("div", class_="area-city-letter").find_all("a") for city in citys: cityspell = city['href'] cityname = city.get_text(strip=True) link = self.root_url + cityspell + "ershouche/" req = Request(link, meta={"city": cityname}, callback=self.parse_list, dont_filter=True) if 'useragent' in response.meta and response.meta[ 'useragent'] is not None: req.headers['User-Agent'] = response.meta['useragent'] yield req self.logger.info(req) yield item
def parse(self, response):
    # debug output for receiving the url
    self._logger.debug("crawled url {}".format(response.request.url))

    # step counter for how many pages we have hit
    step = 0
    if 'step' in response.meta:
        step = response.meta['step']

    # Create Item to send to kafka
    # capture raw response
    item = RawResponseItem()
    # populated from response.meta
    item['appid'] = response.meta['appid']
    item['crawlid'] = response.meta['crawlid']
    item['attrs'] = response.meta['attrs']
    # populated from raw HTTP response
    item["url"] = response.request.url
    item["response_url"] = response.url
    item["status_code"] = response.status
    item["status_msg"] = "OK"
    item["response_headers"] = self.reconstruct_headers(response)
    item["request_headers"] = response.request.headers
    item["body"] = response.body
    item["links"] = []
    # we want to know how far our spider gets
    if item['attrs'] is None:
        item['attrs'] = {}
    item['attrs']['step'] = step

    self._logger.debug("Finished creating item")

    # determine what link we want to crawl
    link_extractor = LinkExtractor(
        allow_domains=response.meta['allowed_domains'],
        allow=response.meta['allow_regex'],
        deny=response.meta['deny_regex'],
        deny_extensions=response.meta['deny_extensions'])

    links = link_extractor.extract_links(response)

    # there are links on the page
    if len(links) > 0:
        self._logger.debug("Attempting to find links")
        link = random.choice(links)
        req = Request(link.url, callback=self.parse)

        # increment our step counter for this crawl job
        req.meta['step'] = step + 1

        # pass along our user agent as well
        if 'useragent' in response.meta and \
                response.meta['useragent'] is not None:
            req.headers['User-Agent'] = response.meta['useragent']

        # debug output
        self._logger.debug("Trying to yield link '{}'".format(req.url))

        # yield the Request to the scheduler
        yield req
    else:
        self._logger.info("Did not find any more links")

    # raw response has been processed, yield to item pipeline
    yield item
def parse(self, response): self._logger.debug("crawled url {}".format(response.request.url)) self._increment_status_code_stat(response) if 'curdepth' in response.meta: cur_depth = response.meta['curdepth'] # capture raw response item = RawResponseItem() # populated from response.meta item['appid'] = response.meta['appid'] item['crawlid'] = response.meta['crawlid'] item['attrs'] = response.meta['attrs'] # populated from raw HTTP response item["url"] = response.request.url item["response_url"] = response.url item["status_code"] = response.status item["status_msg"] = "OK" item["response_headers"] = self.reconstruct_headers(response) item["request_headers"] = response.request.headers item["body"] = response.body item["links"] = [] if isinstance(response, (SplashResponse, SplashTextResponse)): if "png" in response.data: print " @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ " print " @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ " print " @@@@@@@@@@@@@@@@@@@@ image @@@@@@@@@@@@@@@@@@@@@@ " print " @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ " print " @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ " item["image"] = response.data['png'] # login # if response.url == 'http://fyqe73pativ7vdif.onion/login/': # if response.url == 'http://mt3plrzdiyqf6jim.onion/renewal/login.php': if response.url in response.meta['login'] and response.status == 200: _id = response.meta['login'][response.url]['loginid'] _pass = response.meta['login'][response.url]['password'] # print response.body # data, url, method = fill_login_form(response.url, response.body, 'w-_-w', '1234567890') # data, url, method = fill_login_form(response.url, response.body, '0x0', '1234567890') data, url, method = fill_login_form(response.url, response.body, _id, _pass) yield FormRequest(url, formdata=dict(data), method=method, callback=self.parse, meta=make_splash_meta(response.meta)) else: cur_depth = 0 # determine whether to continue spidering if response.meta['maxdepth'] != -1 and cur_depth >= response.meta[ 'maxdepth']: self._logger.debug("Not spidering links in '{}' because" \ " cur_depth={} >= maxdepth={}".format( response.url, cur_depth, response.meta['maxdepth'])) else: # we are spidering -- yield Request for each discovered link link_extractor = LinkExtractor( deny_domains=response.meta['denied_domains'], allow_domains=response.meta['allowed_domains'], allow=response.meta['allow_regex'], deny=response.meta['deny_regex'], deny_extensions=response.meta['deny_extensions']) for link in link_extractor.extract_links(response): # link that was discovered item["links"].append({ "url": link.url, "text": link.text, }) req = Request(link.url, callback=self.parse, meta=make_splash_meta({})) # pass along all known meta fields for key in response.meta.keys(): if key != 'splash' and key != 'request': req.meta[key] = response.meta[key] if '_splash_processed' in req.meta: req.meta.pop("_splash_processed") req.meta['priority'] = response.meta['priority'] - 10 req.meta['curdepth'] = response.meta['curdepth'] + 1 if 'useragent' in response.meta and \ response.meta['useragent'] is not None: req.headers['User-Agent'] = response.meta['useragent'] self._logger.debug("Trying to follow link '{}'".format( req.url)) yield req # raw response has been processed, yield to item pipeline yield item
def parse(self, response):
    # Check url at start of parse to catch links that were potentially redirected.
    orig_domain = response.url
    if "orig_domain" in response.meta:
        orig_domain = response.meta["orig_domain"]
    else:
        response.meta["orig_domain"] = orig_domain
    if not self.validate_link(response.url, orig_domain):
        return

    self._logger.debug("starting parse on url {}".format(response.request.url))
    cur_depth = 0
    if 'curdepth' in response.meta:
        cur_depth = response.meta['curdepth']
    else:
        response.meta['curdepth'] = cur_depth
    self._logger.debug("Forming response object")

    # capture raw response
    item = RawResponseItem()
    # populated from response.meta
    item['appid'] = response.meta['appid']
    item['crawlid'] = response.meta['crawlid']
    item['attrs'] = response.meta['attrs']
    # populated from raw HTTP response
    item["url"] = response.request.url
    item["response_url"] = response.url
    item["status_code"] = response.status
    item["status_msg"] = "OK"
    item["response_headers"] = self.reconstruct_headers(response)
    item["request_headers"] = response.request.headers
    item["links"] = []
    item["curdepth"] = str(cur_depth)

    is_pdf = False
    url = response.url.lower()
    if url.endswith('.pdf') or '.pdf?' in url:
        is_pdf = True
    item["is_pdf"] = str(is_pdf)

    if is_pdf:
        self._logger.debug("Handling pdf file")
        self.download_file(response.url)
        item["body"] = self.pdfparser("temp_document.pdf")
    else:
        item["body"] = self.gather_text(response.body)

    self._logger.debug("Current depth: " + str(cur_depth))

    # determine whether to continue spidering
    if cur_depth >= response.meta['maxdepth']:
        self._logger.debug("Not spidering links in '{}' because"
                           " cur_depth={} >= maxdepth={}".format(
                               response.url,
                               cur_depth,
                               response.meta['maxdepth']))
    else:
        # we are spidering -- yield Request for each discovered link
        link_extractor = LinkExtractor(
            allow_domains=response.meta['allowed_domains'],
            allow=response.meta['allow_regex'],
            deny=response.meta['deny_regex'],
            deny_extensions=response.meta['deny_extensions'])

        for link in link_extractor.extract_links(response):
            # link that was discovered
            the_url = link.url.replace('\n', '')
            if not self.validate_link(the_url, orig_domain):
                continue
            item["links"].append(str({"url": the_url, "text": link.text}))

            req = Request(the_url, callback=self.parse)
            req.meta['priority'] = response.meta['priority'] - 10
            req.meta['curdepth'] = response.meta['curdepth'] + 1
            if 'useragent' in response.meta and \
                    response.meta['useragent'] is not None:
                req.headers['User-Agent'] = response.meta['useragent']

            self._logger.debug("Trying to follow link '{}'".format(req.url))
            yield req

    # raw response has been processed, yield to item pipeline
    yield item
def parse(self, response): print "crawled url", response.request.url cur_depth = 0 if 'curdepth' in response.meta: cur_depth = response.meta['curdepth'] # capture raw response item = RawResponseItem() # populated from response.meta item['appid'] = response.meta['appid'] item['crawlid'] = response.meta['crawlid'] item['attrs'] = response.meta['attrs'] # populated from raw HTTP response item["url"] = response.request.url item["response_url"] = response.url item["status_code"] = response.status item["status_msg"] = "OK" item["headers"] = self.reconstruct_headers(response) item["body"] = response.body item["links"] = [] # determine whether to continue spidering if cur_depth >= response.meta['maxdepth']: self.log("Not spidering links in '{}' because" \ " cur_depth={} >= maxdepth={}".format( response.url, cur_depth, response.meta['maxdepth']), level=INFO) else: # we are spidering -- yield Request for each discovered link link_extractor = LinkExtractor( allow_domains=response.meta['allowed_domains'], allow=response.meta['allow_regex'], deny=response.meta['deny_regex'], deny_extensions=response.meta['deny_extensions']) for link in link_extractor.extract_links(response): # link that was discovered item["links"].append({ "url": link.url, "text": link.text, }) req = Request( link.url, callback=self.parse, meta={ "allowed_domains": response.meta['allowed_domains'], "allow_regex": response.meta['allow_regex'], "deny_regex": response.meta['deny_regex'], "deny_extensions": response.meta['deny_extensions'], "maxdepth": response.meta['maxdepth'], "curdepth": cur_depth + 1, "appid": response.meta['appid'], "crawlid": response.meta['crawlid'], "attrs": response.meta['attrs'], "spiderid": self.name, "expires": response.meta['expires'], "priority": response.meta['priority'] - 10, }, ) self.log("Trying to follow link '{}'".format(req.url), level=INFO) yield req # raw response has been processed, yield to item pipeline yield item
def parse(self, response): self._logger.debug("crawling url {}".format(response.request.url)) is_pdf = False url = response.url.lower() if (url[len(url) - 4:] == '.pdf') or ('.pdf?' in url): self._logger.debug("Found a pdf file, not making a Link object") is_pdf = True else: if "link" not in response.meta: link = Link(response.url) else: link = Link(response.meta["link"]) self._logger.debug("made the link object") self._logger.debug("Link created is of type " + link.type) cur_depth = 0 if 'curdepth' in response.meta: cur_depth = response.meta['curdepth'] # capture raw response item = RawResponseItem() # populated from response.meta item['appid'] = response.meta['appid'] item['crawlid'] = response.meta['crawlid'] item['attrs'] = response.meta['attrs'] # populated from raw HTTP response item["url"] = response.request.url item["response_url"] = response.url item["status_code"] = response.status item["status_msg"] = "OK" item["response_headers"] = self.reconstruct_headers(response) item["request_headers"] = response.request.headers item["body"] = "" item["links"] = [] item["is_pdf"] = str(is_pdf) # if response.meta["maxdepth"] == 0: # response.meta["maxdepth"] = maxint # if "maxdepth" in response.meta and cur_depth >= response.meta['maxdepth']: # self._logger.debug("Not spidering links in '{}' because" \ # " cur_depth={} >= maxdepth={}".format( # response.url, # cur_depth, # response.meta['maxdepth'])) # else: try: # self._logger.debug("Current max depth is " + str(response.meta['maxdepth'])) if is_pdf: self._logger.debug("Downloading pdf file") self.download_file(response.url) self._logger.debug("Parsing pdf file") item["body"] = self.pdfparser("temp_document.pdf") self._logger.debug("Finished handling pdf file") else: self._logger.debug("About to click on link " + link.hrefAttribute) new_links = link.click_and_yield(self.driver) self._logger.debug("Just finished clicking link") item["body"] = link.text for l in new_links: item["links"].append(l.hrefAttribute) request = scrapy.Request(l.hrefAttribute, callback=self.parse) request.meta["link"] = l request.meta['priority'] = max( response.meta['priority'] - 10, 0) request.meta['curdepth'] = response.meta['curdepth'] + 1 if 'useragent' in response.meta and \ response.meta['useragent'] is not None: request.headers['User-Agent'] = response.meta[ 'useragent'] self._logger.debug( "Making a requeset object for this newly found link: '{}'" .format(request.url)) yield request except LinkException: self._logger.debug("Could not click link:" + str(link)) # raw response has been processed, yield to item pipeline self._logger.debug("Yielding item") yield item