def process_item(self, item, spider, db=None):
    # Get domain and parsed URL info.
    domain = Domain.find_stub_by_url(item["url"], db)
    parsed = ParsedURL(item["url"])
    now = datetime.now()

    # Get or create file.
    file_row = db.query(File).filter(File.url == item["url"]).scalar()
    if not file_row:
        statement = insert(File).values(
            url=item["url"],
            domain_id=domain.id,
            last_crawl=now,
            size=item["size"],
            path=parsed.path).on_conflict_do_nothing(
                index_elements=["url"])
        db.execute(statement)
        file_row = db.query(File).filter(File.url == item["url"]).scalar()

    # Update file information.
    file_store = HashedFile.from_data(item["content"], save=False)
    file_row.last_crawl = now
    if domain.blacklisted:
        # Override the old file before replacing the content.
        file_store.write(BLACKLISTED_BLANK)
        file_row.content = BLACKLISTED_BLANK
    elif file_store.read() != item["content"]:
        file_row.content = item["content"]
    db.commit()
    return item
def process_exception(self, response, exception, spider):
    parsed = ParsedURL(response.url)
    if isinstance(exception, TwistedTimeoutError):
        self.server.incr("timeouts:" + md5(parsed.host), 1)
        self.server.expire("timeouts:" + md5(parsed.host), 60 * 60 * 24)
    elif exception:
        self.logger.error("Caught unhandled exception in spider.")
        self.logger.error(traceback.format_exc())
def process_request(self, request, spider):
    if not Domain.is_onion_url(request.url):
        return None
    parsed = ParsedURL(request.url)
    subdomains = parsed.host.count(".")
    if subdomains > 2:
        raise IgnoreRequest('Too many subdomains (%d > 2)' % subdomains)
    return None
def process_exception(self, request, exception, spider):
    parsed = ParsedURL(request.url)
    if isinstance(exception, TwistedTimeoutError):
        self.redis.incr("timeouts:" + md5(parsed.host), 1)
        self.redis.expire("timeouts:" + md5(parsed.host), 60 * 60 * 24)
    elif exception:
        spider.logger.error("Caught unhandled exception in handler.")
        spider.logger.error(traceback.format_exc())
    return None
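# The two timeout handlers above assume an md5() helper that returns a hex
# string usable inside a Redis key. A minimal sketch of that assumed helper
# (hypothetical; the project's actual implementation may differ):
import hashlib


def md5(value: str) -> str:
    # Hash the host so the Redis key stays short and uniform.
    return hashlib.md5(value.encode("utf-8")).hexdigest()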
def is_onion_url(url: str):
    url = url.strip()
    if not re.match(r"http[s]?://", url):
        return False
    try:
        parsed_url = ParsedURL(url)
        if onion_regex.match(parsed_url.host):
            return True
        else:
            return False
    except TypeError:
        return False
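# is_onion_url() depends on a module-level onion_regex. A minimal sketch of
# such a pattern, assuming v2 (16-char) and v3 (56-char) base32 onion
# hostnames with optional subdomains; the real pattern may differ:
import re

onion_regex = re.compile(r"^(?:[a-z0-9-]+\.)*[a-z2-7]{16}(?:[a-z2-7]{40})?\.onion$")

# Hypothetical usage:
#   is_onion_url("http://expyuzz4wqqyqhjn.onion/")  -> True
#   is_onion_url("https://example.com/")            -> False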
def process_request(self, request, spider):
    # Allow requests if the max pages is disabled.
    if self.max_pages == -1:
        return None
    parsed = ParsedURL(request.url)
    page_count = self.pages_script(args=[parsed.host, self.max_pages])
    if page_count < self.max_pages:
        spider.logger.info('Page count is %d for %s' % (page_count, parsed.host))
        return None
    else:
        raise IgnoreRequest('MAX_PAGES_PER_DOMAIN reached, filtered %s' % request.url)
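# self.pages_script above is assumed to be a server-side Redis Lua script
# (e.g. registered via redis-py's register_script) that atomically bumps a
# per-host counter and returns it. The sketch below is an assumption about
# its shape, not the project's actual script; it reuses the
# "spider:pagecount" hash that the exception handler further down decrements.
import redis

PAGE_COUNT_LUA = """
local count = tonumber(redis.call('HGET', 'spider:pagecount', ARGV[1])) or 0
if count < tonumber(ARGV[2]) then
    count = redis.call('HINCRBY', 'spider:pagecount', ARGV[1], 1)
end
return count
"""


def make_pages_script(server: redis.Redis):
    # The returned Script object is callable as script(args=[host, max_pages]).
    return server.register_script(PAGE_COUNT_LUA)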
def find_stub_by_url(cls, url: str, db):
    page = db.query(Page).filter(Page.url == url).scalar()
    if not page:
        domain = Domain.find_stub_by_url(url, db)
        parsed = ParsedURL(url)
        statement = insert(Page).values(
            url=url,
            domain_id=domain.id,
            path=parsed.path,
        ).on_conflict_do_nothing(index_elements=["url"])
        db.execute(statement)
        page = cls.find_stub_by_url(url, db)
    return page
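# Note on the get-or-create pattern above: the INSERT uses PostgreSQL's
# ON CONFLICT DO NOTHING, so two crawls racing on the same URL cannot raise a
# unique-constraint error; the recursive find_stub_by_url call then reads back
# whichever row actually won the race.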
def process_item(self, item, spider, db=None):
    # Sanity checks
    if not item:
        raise DropItem("Somehow got a blank item dict.")
    if not Domain.is_onion_url(item["url"]):
        raise DropItem(f"{item['url']} is not an onion.")

    now = datetime.now()
    parsed = ParsedURL(item["url"])

    # Get or create domain and update info.
    domain = Domain.find_stub_by_url(item["url"], db)
    domain.last_crawl = now
    domain.alive = item["status_code"] not in BAD_STATUS_CODES
    if item["frontpage"]:
        # Do not overwrite an existing domain title with an empty one.
        if not (domain.title != '' and item["title"] == ''):
            domain.title = item["title"]
    db.commit()

    # Get or create page.
    page = Page.find_stub_by_url(item["url"], db)

    # Update page information.
    page.status_code = item["status_code"]
    page.last_crawl = now
    page.header_server = item["server"]
    page.header_powered_by = item["powered_by"]
    page.title = item["title"]
    if page.is_frontpage != item["frontpage"]:
        page.is_frontpage = item["frontpage"]

    # Update links to.
    page.links_to = list(item["links_to"])
    db.commit()
    return item
def parse_page_info(self, response):
    """
    Parses the page meta information for the pipeline.

    Example return:
    {
        "host": "someonionpagehostname.onion",
        "url": "someonionpagehostname.onion/",
        "status_code": 200,
        "size": 420,
        "server": "TotallyReal Server",
        "powered_by": "IE 6.0",
        "title": "Page title",
        "frontpage": True,
        "content": "<h1>Under Construction</h1>",
        "links_to": set()
    }
    """
    page_metadata = {
        # HTTP headers
        "host": "",
        "url": response.url,
        "status_code": response.status,
        "size": 0,
        "server": "",
        "powered_by": "",
        # Parsed from page
        "title": "",
        "frontpage": False,
        "content": None,
        "links_to": set(),
        "other_links": set(),
    }

    # Attempt setting the content
    try:
        page_metadata["content"] = response.text
    except AttributeError:
        page_metadata["content"] = response.body

    # Grab the title of the page.
    try:
        page_metadata["title"] = response.css('title::text').extract_first()
    except AttributeError:
        pass
    except scrapy.exceptions.NotSupported:
        self.logger.debug(f"Fetched non-text file {response.url}")

    # Get tor URL "hostname"
    parsed = ParsedURL(response.url)
    self.log('Got %s (%s)' % (response.url, page_metadata["title"]))

    page_metadata["frontpage"] = Page.is_frontpage_request(response.request)
    page_metadata["size"] = len(response.body)
    page_metadata["host"] = parsed.host

    got_server_response = response.status in GOOD_STATUS_CODES

    # Domain headers
    if got_server_response:
        if response.headers.get("Server"):
            page_metadata["server"] = str(response.headers.get("Server"))
        if response.headers.get("X-Powered-By"):
            page_metadata["powered_by"] = str(
                response.headers.get("X-Powered-By"))
        if response.headers.get("Powered-By"):
            page_metadata["powered_by"] = str(
                response.headers.get("Powered-By"))

    is_text = False
    content_type = str(response.headers.get("Content-Type"))
    if got_server_response and content_type and re.match(
            '^text/', content_type.strip()):
        is_text = True

    # Update links_to
    if parsed.host not in self.spider_exclude:
        try:
            for url in response.xpath('//a/@href').extract():
                # Split the URL on any onion to clean out web-to-onion proxy
                # services if they exist.
                fullurl_parts = response.urljoin(url).split(".onion", 1)

                # Skip this URL if it has only one part. Onions should have two parts.
                if len(fullurl_parts) == 1:
                    self.logger.debug(
                        f"Stage 1 dropping non-onion URL '{fullurl_parts[0]}'.")
                    continue

                # Some people did things like qwertyuiop.onion.onion/index.php.
                # No idea why, but this happened. Strip the repeated ".onion"
                # prefix (a slice, since str.lstrip removes a character set,
                # not a prefix).
                while fullurl_parts[1].startswith(".onion"):
                    fullurl_parts[1] = fullurl_parts[1][len(".onion"):]

                # Merge the parts back together.
                fullurl = urljoin(fullurl_parts[0] + ".onion", fullurl_parts[1])

                # Do additional checks post-merge just in case things happen.
                if not got_server_response:
                    self.logger.debug(
                        f"Did not get server response from '{fullurl}'.")
                    continue
                elif not Domain.is_onion_url(fullurl):
                    self.logger.debug(
                        f"Stage 2 dropping non-onion URL '{fullurl_parts[0]}'.")
                    continue

                # Parse the link and update the lists.
                try:
                    parsed_link = ParsedURL(fullurl)
                    link_host = parsed_link.host
                except Exception:
                    continue
                if parsed.host != link_host:
                    page_metadata["links_to"].add(fullurl)
                else:
                    page_metadata["other_links"].add(fullurl)

            if len(page_metadata["links_to"]) <= 5:
                self.logger.debug("link_to_list len %s %s" % (
                    len(page_metadata["links_to"]), page_metadata["links_to"]))
            else:
                self.logger.debug("link_to_list len %s truncated" % (
                    len(page_metadata["links_to"])))
        except (AttributeError, scrapy.exceptions.NotSupported):
            pass

    return page_metadata
def process_exception(self, request, exception, spider):
    parsed = ParsedURL(request.url)
    if exception:
        self.redis.hincrby("spider:pagecount", parsed.host, -1)
    return None
if page.id % 250 == 0:
    print(f"Currently at ID {page.id}.")

title = f"{domains_by_id[page.domain_id].host}\n{domains_by_id[page.domain_id].title or 'No title.'}"

if page.domain_id not in used_domains:
    nodes.append({
        "id": domains_by_id[page.domain_id].host + ":" +
              str(domains_by_id[page.domain_id].port),
        "label": title
    })
    used_domains.add(page.domain_id)

for link in page.links_to:
    parsed = ParsedURL(link)
    if not onion_regex.match(parsed.host):
        continue
    if parsed.host not in domains_by_host:
        continue
    link_iters[domains_by_id[page.domain_id].host + ":" + str(
        domains_by_id[page.domain_id].port)][
            parsed.host + ":" + str(parsed.port)] += 1

print(f"{len(nodes)} nodes graphed.")

# Construct vis data.
for parent_node, child_link_list in link_iters.items():
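# For reference: the graphing excerpt above assumes a few pre-built
# structures. A hypothetical initialisation, using only the standard library
# (the actual script may build these differently):
from collections import defaultdict, Counter

nodes = []                         # vis.js-style node dicts: {"id": ..., "label": ...}
used_domains = set()               # domain ids already emitted as nodes
link_iters = defaultdict(Counter)  # parent "host:port" -> Counter of child "host:port" links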
def find_stub_by_url(cls, url, db):
    parsed = ParsedURL(url)
    return cls.find_stub(parsed.host, parsed.port, parsed.secure, db)
def is_frontpage_url(url):
    parsed = ParsedURL(url)
    if parsed.path == '/':
        return True
    return False
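# Hypothetical usage of the frontpage check:
#   is_frontpage_url("http://expyuzz4wqqyqhjn.onion/")           -> True
#   is_frontpage_url("http://expyuzz4wqqyqhjn.onion/about.html") -> False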