Example #1
    def parse_deepweb(self, response):
        # Collect every href attribute on the page and keep the .onion links.
        scrape = response.selector.xpath("//@href").extract()
        for link in scrape:
            link = link.rstrip("/")  # normalise away any trailing slash
            if ".onion" in link:
                self.log("queued " + link)
                self.onions.add(link)

        self.log("HTTP status 200 received for deepweblinks.org", level=log.INFO)
        self.log("harvested " + str(len(self.onions)) + " .onion links", level=log.INFO)

        item = ToolsItem()  # TODO: actually make a response item??
        return item
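All four snippets are methods of legacy (pre-1.0) Scrapy spiders: they use the old module-level scrapy.log (removed in Scrapy 1.0 in favour of self.logger), a shared ToolsItem, and a self.onions set. A minimal sketch of the class context they assume follows; the class name, field name, and start URL are hypothetical, since only ToolsItem, self.onions and self.log appear in the source.

# Hypothetical scaffolding for the snippets in this section (assumption, not
# from the source): a legacy Scrapy spider that accumulates .onion links.
import scrapy
from scrapy import log  # legacy logging module, removed in Scrapy 1.0
# Example #3 additionally assumes:
#   from re import match
#   from datetime import datetime


class ToolsItem(scrapy.Item):
    # The real field list is not shown in the source.
    url = scrapy.Field()


class OnionSpider(scrapy.Spider):
    name = "onion_harvester"  # hypothetical
    start_urls = ["http://deepweblinks.org/"]

    def __init__(self, *args, **kwargs):
        super(OnionSpider, self).__init__(*args, **kwargs)
        self.onions = set()  # deduplicated set of harvested .onion URLs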
Example #2
    def parse_skunksworked(self, response):
        # The response body is a plain-text list, one onion address per line.
        links = response.body.split('\n')
        for link in links:
            if not link:
                continue
            # abc.something.onion => something
            parts = link.split('.')
            if len(parts) == 2:
                link = "http://" + link
                self.onions.add(link)
            elif len(parts) == 3:
                # strip the leading subdomain, keep the bare onion domain
                link = "http://" + parts[1] + ".onion/"
                self.onions.add(link)
            else:
                self.log("failed to queue onion link: " + link, level=log.WARNING)
        item = ToolsItem()
        return item
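The two-or-three-label rule above can be read as a standalone normaliser. A minimal sketch (normalize_onion is a hypothetical name; the logic mirrors the method above):

# Hypothetical standalone version of the normalisation rule used above.
def normalize_onion(link):
    parts = link.split('.')
    if len(parts) == 2:    # "something.onion" -> keep as-is
        return "http://" + link
    if len(parts) == 3:    # "abc.something.onion" -> drop the subdomain
        return "http://" + parts[1] + ".onion/"
    return None            # unexpected shape; the spider logs a warning

assert normalize_onion("examplexyz.onion") == "http://examplexyz.onion"
assert normalize_onion("abc.examplexyz.onion") == "http://examplexyz.onion/"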
Example #3
    def parse(self, response):
        timestamp = datetime.now().strftime("%y-%m-%d-%H-%M")
        try:
            node = match(r"(http.*abcd\.)(.+?/)",
                         response.url).group(2)  # gets the domain
            if node:
                # "example.onion/" -> "exampleonion_" so it is filename-safe
                node = node.replace(".", "").replace("/", "_")
                filename = "/usr/local/lib/ahmia/tor2web_stats/" + node + timestamp + ".json"
                json_str = valid_pretty_json(response.body)
                text2file(json_str, filename)
            else:
                raise Exception("empty domain match")
        except Exception as exc:  # catch-all: unmatched URL, invalid JSON or I/O error
            self.log("Failed to save json data for {0}: {1}"
                     .format(response.url, exc), level=log.ERROR)

        item = ToolsItem()
        return item
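valid_pretty_json and text2file are project helpers that are not shown in the source. Plausible minimal versions, assuming the first re-serialises (and thereby validates) the JSON body and the second writes text to a path:

# Hypothetical implementations of the two helpers assumed above.
import json

def valid_pretty_json(text):
    # json.loads() raises ValueError on invalid input, which the
    # catch-all except in parse() turns into an error log entry.
    return json.dumps(json.loads(text), indent=4, sort_keys=True)

def text2file(text, filename):
    with open(filename, "w") as f:
        f.write(text)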
Example #4
    def parse(self, response):
        if response.status == 200:
            links_count = ""
            scrape = response.selector.xpath("//p[@id='results_count_p']/text()").extract()
            if len(scrape) > 0:
                # e.g. "About 1,230 results" -> "1,230"
                links_count = find_between(scrape[0], "About ", " results")

            if links_count:  # only overwrite the stored count when one was parsed
                self.count = links_count

            # keep every result link except Google's own internal ones
            self.backlinks = \
                [link for link in response.selector.xpath("//h3//a/@href").extract()
                 if "http://www.google.com/" not in link]

        else:
            self.log("backlink spider received HTTP response status {0}"
                     .format(response.status), level=log.WARNING)

        item = ToolsItem()  # TODO: actually make a backlinker response item??
        return item
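find_between is another helper the snippet assumes but does not show. A plausible implementation that returns the substring between two markers, or an empty string when either marker is missing:

# Hypothetical implementation of the find_between() helper used above.
def find_between(text, first, last):
    # e.g. find_between("About 1,230 results", "About ", " results") -> "1,230"
    try:
        start = text.index(first) + len(first)
        end = text.index(last, start)
        return text[start:end]
    except ValueError:  # marker not found
        return ""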