def parse_deepweb(self, response):
    """Harvest .onion links from the deepweblinks.org response."""
    scrape = response.selector.xpath("//@href").extract()
    for link in scrape:
        link = link.rstrip("/")
        if ".onion" in link:
            self.log("queued " + link)
            self.onions.add(link)
    self.log("HTTP status 200 received for deepweblinks.org", level=log.INFO)
    self.log("harvested " + str(len(self.onions)) + " .onion links",
             level=log.INFO)
    item = ToolsItem()  # TODO: actually make a response item??
    return item
def parse_skunksworked(self, response):
    """Harvest .onion links from the plain-text skunksworked link list."""
    links = response.body.split('\n')
    for link in links:
        link = link.strip()  # drop surrounding whitespace such as a trailing '\r'
        if not link:
            continue
        parts = link.split('.')
        if len(parts) == 2:
            # something.onion => http://something.onion
            link = "http://" + link
            self.onions.add(link)
        elif len(parts) == 3:
            # abc.something.onion => http://something.onion/
            link = "http://" + parts[1] + ".onion/"
            self.onions.add(link)
        else:
            self.log("failed to queue onion link: " + link, level=log.WARNING)
    item = ToolsItem()
    return item
def parse(self, response):
    """Save the tor2web statistics JSON for this node to a timestamped file."""
    timestamp = datetime.now().strftime("%y-%m-%d-%H-%M")
    try:
        # group(2) is the node's domain
        node = match(r"(http.*abcd\.)(.+?/)", response.url).group(2)
        if node:
            node = node.replace(".", "").replace("/", "_")
            filename = ("/usr/local/lib/ahmia/tor2web_stats/"
                        + node + timestamp + ".json")
            json_str = valid_pretty_json(response.body)
            text2file(json_str, filename)
        else:
            raise Exception("could not extract node from " + response.url)
    except Exception as exc:  # catch all: a failed save should not stop the crawl
        self.log("Failed to save json data for " + response.url + ": " + str(exc),
                 level=log.ERROR)
    item = ToolsItem()
    return item
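# valid_pretty_json() and text2file() are used above but not defined in this
# section; the sketches below are assumptions about their behaviour based on how
# parse() calls them, not the project's actual implementations (which presumably
# live in a shared utils module).
import json

def valid_pretty_json(text):
    """Assumed behaviour: parse `text` as JSON and return it pretty-printed.

    Invalid JSON raises ValueError, which the caller's catch-all except handles.
    """
    return json.dumps(json.loads(text), indent=4, sort_keys=True)

def text2file(text, filename):
    """Assumed behaviour: write `text` to `filename`, replacing any old contents."""
    with open(filename, "w") as output_file:
        output_file.write(text)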
def parse(self, response):
    """Parse a search-results page: record the result count and backlink URLs."""
    if response.status == 200:
        links_count = ""
        scrape = response.selector.xpath(
            "//p[@id='results_count_p']/text()").extract()
        if len(scrape) > 0:
            # e.g. "About 1,230 results" => "1,230"
            links_count = find_between(scrape[0], "About ", " results")
        if self.count:
            self.count = links_count
        self.backlinks = [
            link for link in response.selector.xpath("//h3//a/@href").extract()
            if "http://www.google.com/" not in link
        ]
    else:
        self.log("backlink spider received HTTP response status {0}"
                 .format(response.status), level=log.WARNING)
    item = ToolsItem()  # TODO: actually make a backlinker response item??
    return item
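# find_between() is also referenced above without a definition here. The sketch
# below assumes it returns the substring between the first occurrences of two
# markers (so "About 1,230 results" => "1,230") and an empty string when either
# marker is missing; the real helper may differ.
def find_between(value, first, last):
    """Assumed helper: return the text between `first` and `last` in `value`."""
    try:
        start = value.index(first) + len(first)
        end = value.index(last, start)
        return value[start:end]
    except ValueError:
        return ""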