def __init__(self, thread_count=8):
    """
    Creates the helpers this object works with.

    thread_count is handed to ThreadPool unchanged; the resulting pool is
    stored on self.pool. A DNSTracer and an AsnTracer are stored on
    self.dns_tracer and self.asn_tracer respectively.
    """
    # Construction order is kept as-is: DNS tracer, pool, ASN tracer.
    self.dns_tracer = DNSTracer()
    self.pool = ThreadPool(thread_count)
    self.asn_tracer = AsnTracer()
class Browser(object):
    """
    Controls the browser (PhantomJS). Visits sites with it, records hosts
    that were contacted during the page load and does traceroutes to these
    hosts and the dns servers used to locate them.
    """

    def __init__(self, thread_count=8):
        """
        Sets up the tracers and the worker pool.

        thread_count is passed straight to ThreadPool; the pool's map() is
        used by _trace to trace the DNS servers of a host.
        """
        self.dns_tracer = DNSTracer()
        self.pool = ThreadPool(thread_count)
        self.asn_tracer = AsnTracer()

    def _hosts_from_page(self, page_url):
        """
        Uses PhantomJS to visit a page and returns the set of hosts that
        were connected to in order to fetch resources while loading it.

        Every line printed by browser.js is treated as an (optionally
        double-quoted) URL. URLs without a network location (for example
        "data:" URIs or blank lines) are skipped instead of being reported
        as an empty host. User info, port and IPv6 brackets are removed
        from the host, and hosts are lowercased so that the same host is
        only reported once.
        """
        # NOTE(review): page_url is concatenated into the command string
        # unquoted. A URL containing whitespace or '|' would presumably be
        # split into extra arguments/commands by envoy -- confirm, and quote
        # the URL if it can come from an untrusted source.
        browser_proc = envoy.run("phantomjs browser.js " + page_url,
                                 timeout=30)
        contacted_hosts = set()
        # splitlines() + strip() so that "\r\n" line endings or stray
        # whitespace do not end up inside the parsed host name.
        for line in browser_proc.std_out.splitlines():
            url = line.strip().strip('"')
            # hostname is None when the URL has no network location.
            host = urlparse.urlparse(url).hostname
            if host:
                contacted_hosts.add(host)
        return contacted_hosts

    def visit_multiple(self, page_urls):
        """
        The plural version of visit. Returns a list with one result per
        url, in the order the urls were given.
        """
        # TODO: parallelize?
        return [self.visit(page_url) for page_url in page_urls]

    def visit(self, page_url):
        """
        Visits a webpage and determines the paths that are traversed when
        visiting it. Returns a PageResult built from the page url and the
        trace of every host contacted while loading the page.
        """
        resource_hosts = self._hosts_from_page(page_url)
        # Progress output; the parenthesised form prints the same text on
        # Python 2 and is also valid on Python 3.
        print(page_url)
        # This stores the results we care about
        page_result = {
            'page': page_url,
            'resource_hosts': [self._trace(host) for host in resource_hosts],
        }
        return PageResult(page_result)

    def _trace(self, host):
        """
        Traces the Asns to a host and to the nameservers used to find the
        host. Returns a dict with the host, the Asns traversed to it, and
        one entry per queried nameserver with the Asns traversed to that
        nameserver.
        """
        def asn_tracer_dns_helper(contacted_host):
            """ For use on DNS servers. """
            return {
                'host': contacted_host,
                'traversed_asns': self.asn_tracer.trace(contacted_host),
            }

        dirty_queried_dns_servers = self.dns_tracer.trace(host)
        # Drop empty entries and servers with private addresses before
        # tracing them.
        queried_dns_servers = [
            dns_server for dns_server in dirty_queried_dns_servers
            if dns_server and not is_addr_private(dns_server)
        ]
        return {
            'host': host,
            'traversed_asns': self.asn_tracer.trace(host),
            'queried_dns_servers': self.pool.map(asn_tracer_dns_helper,
                                                 queried_dns_servers),
        }
class Browser(object):
    """
    Controls the browser (PhantomJS). Visits sites with it, records hosts
    that were contacted during the page load and does traceroutes to these
    hosts and the dns servers used to locate them.
    """

    def __init__(self, thread_count=8):
        """
        Sets up the tracers and the worker pool.

        thread_count is passed straight to ThreadPool; the pool's map() is
        used by _trace to trace the DNS servers of a host.
        """
        self.dns_tracer = DNSTracer()
        self.pool = ThreadPool(thread_count)
        self.asn_tracer = AsnTracer()

    def _hosts_from_page(self, page_url):
        """
        Uses PhantomJS to visit a page and returns the set of hosts that
        were connected to in order to fetch resources while loading it.

        Every line printed by browser.js is treated as an (optionally
        double-quoted) URL. URLs without a network location (for example
        "data:" URIs or blank lines) are skipped instead of being reported
        as an empty host. User info, port and IPv6 brackets are removed
        from the host, and hosts are lowercased so that the same host is
        only reported once.
        """
        # NOTE(review): page_url is concatenated into the command string
        # unquoted. A URL containing whitespace or '|' would presumably be
        # split into extra arguments/commands by envoy -- confirm, and quote
        # the URL if it can come from an untrusted source.
        browser_proc = envoy.run("phantomjs browser.js " + page_url,
                                 timeout=30)
        contacted_hosts = set()
        # splitlines() + strip() so that "\r\n" line endings or stray
        # whitespace do not end up inside the parsed host name.
        for line in browser_proc.std_out.splitlines():
            url = line.strip().strip('"')
            # hostname is None when the URL has no network location.
            host = urlparse.urlparse(url).hostname
            if host:
                contacted_hosts.add(host)
        return contacted_hosts

    def visit_multiple(self, page_urls):
        """
        The plural version of visit. Returns a list with one result per
        url, in the order the urls were given.
        """
        # TODO: parallelize?
        return [self.visit(page_url) for page_url in page_urls]

    def visit(self, page_url):
        """
        Visits a webpage and determines the paths that are traversed when
        visiting it. Returns a PageResult built from the page url and the
        trace of every host contacted while loading the page.
        """
        resource_hosts = self._hosts_from_page(page_url)
        # Progress output; the parenthesised form prints the same text on
        # Python 2 and is also valid on Python 3.
        print(page_url)
        # This stores the results we care about
        page_result = {
            'page': page_url,
            'resource_hosts': [self._trace(host) for host in resource_hosts],
        }
        return PageResult(page_result)

    def _trace(self, host):
        """
        Traces the Asns to a host and to the nameservers used to find the
        host. Returns a dict with the host, the Asns traversed to it, and
        one entry per queried nameserver with the Asns traversed to that
        nameserver.
        """
        def asn_tracer_dns_helper(contacted_host):
            """ For use on DNS servers. """
            return {
                'host': contacted_host,
                'traversed_asns': self.asn_tracer.trace(contacted_host),
            }

        dirty_queried_dns_servers = self.dns_tracer.trace(host)
        # Drop empty entries and servers with private addresses before
        # tracing them.
        queried_dns_servers = [
            dns_server for dns_server in dirty_queried_dns_servers
            if dns_server and not is_addr_private(dns_server)
        ]
        return {
            'host': host,
            'traversed_asns': self.asn_tracer.trace(host),
            'queried_dns_servers': self.pool.map(asn_tracer_dns_helper,
                                                 queried_dns_servers),
        }