Example #1
File: validation.py Project: rhoerbe/peer
def validate_non_public_suffix(value):
    """
    check that value is not a public suffix
    """
    psl = PublicSuffixList()
    if psl.get_public_suffix(value) != psl.get_public_suffix("x." + value):
        raise ValidationError(_("You cannot register public suffixes"))
Example #2
File: validation.py Project: Emergya/peer
def validate_non_public_suffix(value):
    '''
    check that value is not a public suffix
    '''
    psl = PublicSuffixList()
    if psl.get_public_suffix(value) != psl.get_public_suffix('x.' + value):
        raise ValidationError(_('You cannot register public suffixes'))
Example #3
def validate_non_public_suffix(value):
    '''
    check that value is not a public suffix
    '''
    psl = PublicSuffixList()
    if psl.get_public_suffix(value) != psl.get_public_suffix('x.' + value):
        raise ValidationError(_('You cannot register public suffixes'))
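All three variants above rely on the same trick: prefixing an extra label ("x.") only changes what get_public_suffix returns when the value is itself a public suffix. A minimal standalone sketch (not taken from any of the projects above, using the list bundled with the publicsuffix package) that makes this visible:

from publicsuffix import PublicSuffixList

psl = PublicSuffixList()  # bundled copy of the Public Suffix List
print(psl.get_public_suffix("co.uk"))            # 'co.uk'  -> a public suffix
print(psl.get_public_suffix("x.co.uk"))          # 'x.co.uk' -> differs, so "co.uk" is rejected
print(psl.get_public_suffix("example.co.uk"))    # 'example.co.uk'
print(psl.get_public_suffix("x.example.co.uk"))  # 'example.co.uk' -> equal, so "example.co.uk" is accepted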
Example #4
    def run(self):

        main_domain = _get_maindomain()

        all_domains = domain_list()["domains"]
        for domain in all_domains:
            self.logger_debug("Diagnosing DNS conf for %s" % domain)
            is_subdomain = domain.split(".", 1)[1] in all_domains
            for report in self.check_domain(domain,
                                            domain == main_domain,
                                            is_subdomain=is_subdomain):
                yield report

        # Check if a domain bought by the user will expire soon
        psl = PublicSuffixList()
        domains_from_registrar = [
            psl.get_public_suffix(domain) for domain in all_domains
        ]
        domains_from_registrar = [
            domain for domain in domains_from_registrar if "." in domain
        ]
        domains_from_registrar = set(domains_from_registrar) - set(
            YNH_DYNDNS_DOMAINS + ["netlib.re"])
        for report in self.check_expiration_date(domains_from_registrar):
            yield report
Example #5
    def __init__(self,
                 pipes=None,
                 autoreload=False,
                 frequency=600,
                 aliases=None,
                 cache_enabled=True,
                 observers=None,
                 store=None):

        if aliases is None:
            aliases = ATTRS
        if not observers:
            observers = []
        if not pipes:
            pipes = []
        self._pipes = pipes
        self.cache_enabled = cache_enabled
        self.lock = ReadWriteLock()
        self.plumbings = [plumbing(v) for v in pipes]
        self.refresh = MDUpdate(cherrypy.engine, server=self, frequency=frequency)
        self.refresh.subscribe()
        self.aliases = aliases
        self.psl = PublicSuffixList()
        self.md = MDRepository(metadata_cache_enabled=self.cache_enabled, store=store)

        if autoreload:
            for f in pipes:
                cherrypy.engine.autoreload.files.add(f)
Example #6
def receiver_policy(host):
    # type: (str) -> ReceiverPolicy
    '''Get the DMARC receiver policy for a host.

:param str host: The host to lookup.
:returns: The DMARC receiver policy for the host.
:rtype:  A member of the :class:`gs.dmarc.ReceiverPolicy` enumeration.

The :func:`receiver_policy` function looks up the DMARC receiver policy
for ``host``. If the host does not have a published policy,
`the organizational domain`_ is determined. The DMARC policy for the
organizational domain is queried, and the subdomain policy is returned
(if specified) or the overall policy for the domain is returned.
Internally the :func:`gs.dmarc.lookup.lookup_receiver_policy` is used to
perform the query.

.. _the organizational domain:
   http://tools.ietf.org/html/draft-kucherawy-dmarc-base-04#section-3.2'''
    hostSansDmarc = host if host[:7] != '_dmarc.' else host[7:]

    retval = lookup_receiver_policy(hostSansDmarc)
    if retval == ReceiverPolicy.noDmarc:
        fn = get_suffix_list_file_name()
        with open(fn) as suffixList:
            psl = PublicSuffixList(suffixList)
            newHost = psl.get_public_suffix(hostSansDmarc)
        retval = lookup_receiver_policy(newHost, policyTag='sp')
    return retval
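A hedged illustration of the organizational-domain step above (the host name and file path are only examples): get_public_suffix collapses a host without its own DMARC record to its registrable domain, which is then queried for the subdomain ('sp') policy.

from publicsuffix import PublicSuffixList

# assumes a local copy of https://publicsuffix.org/list/public_suffix_list.dat
with open('public_suffix_list.dat') as suffix_list:
    psl = PublicSuffixList(suffix_list)
print(psl.get_public_suffix('mail.lists.example.co.uk'))  # 'example.co.uk'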
Example #7
File: lookup.py Project: alexymik/gs.dmarc
def receiver_policy(host):
    '''Get the DMARC receiver policy for a host.

:param str host: The host to lookup.
:returns: The DMARC receiver policy for the host.
:rtype:  A member of the :class:`gs.dmarc.ReceiverPolicy` enumeration.

The :func:`receiver_policy` function looks up the DMARC receiver policy
for ``host``. If the host does not have a published policy,
`the organizational domain`_ is determined and the DMARC policy for this is
returned. Internally the :func:`gs.dmarc.lookup.lookup_receiver_policy` is
used to perform the query.

.. _the organizational domain:
   http://tools.ietf.org/html/draft-kucherawy-dmarc-base-04#section-3.2'''
    hostSansDmarc = host if host[:7] != '_dmarc.' else host[7:]

    retval = lookup_receiver_policy(hostSansDmarc)
    if retval == ReceiverPolicy.noDmarc:
        # TODO: automatically update the suffix list data file
        # <https://publicsuffix.org/list/effective_tld_names.dat>
        fn = get_suffix_list_file_name()
        with open(fn, 'r') as suffixList:
            psl = PublicSuffixList(suffixList)
            newHost = psl.get_public_suffix(hostSansDmarc)
        # TODO: Look up the subdomain policy
        retval = lookup_receiver_policy(newHost)
    return retval
Example #8
def normdomain(domain):
    global _psl

    if _psl is None:
        _psl=PublicSuffixList(open(ssl_config.psl_filename, encoding='utf-8'))

    suf=_psl.get_public_suffix(domain)
    return domain if domain==suf else '*.%s'%suf
Example #9
def get_exact_domain(url):
    psl = PublicSuffixList()
    url = url.strip()
    u = urlparse(url)
    h = u.hostname
    if not h:
        h = url
    return psl.get_public_suffix(h)
Example #10
def getPublicSuffixDomain(host):
    if not host:
        return "other"
    psl = PublicSuffixList()
    domain = psl.get_public_suffix(host)
    if "." not in domain:
        domain = "other"
    return domain
Example #11
def get_exact_domain(url):
    psl = PublicSuffixList()
    url = url.strip()
    u = urlparse(url)
    h = u.hostname
    if not h:
        h = url
    return psl.get_public_suffix(h)
Example #12
def normdomain(domain):
    global _psl

    if _psl is None:
        _psl = PublicSuffixList(open(ssl_config.psl_filename,
                                     encoding='utf-8'))

    suf = _psl.get_public_suffix(domain)
    return domain if domain == suf else '*.%s' % suf
Example #13
def get_dmarc_record(hostname, use_cache=True,
                     cache_expiration=DEFAULT_EXPIRATION_TIME):
    """Return a DMARC record for a given hostname.

    This will query the DNS records for a hostname, returning a parsed version
    of the DMARC record, if found. If a record could not be found for the
    hostname, the organizational domain will be used instead (which is
    generally example.com for foo.bar.example.com, but this depends on the
    domain in question).

    By default, the fetched record from DNS is cached, allowing this to be
    called multiple times without repeated DNS queries. This is optional,
    as is the expiration time for the cached data (which defaults to 1 month).

    Args:
        hostname (unicode):
            The hostname to look up the DMARC information from.

        use_cache (bool, optional):
            Whether to use the cache for looking up and storing record data.

        cache_expiration (int, optional):
            The expiration time for cached data.

    Returns:
        DmarcRecord:
        The DMARC record. If it could not be found, ``None`` will be returned
        instead.
    """
    record = _fetch_dmarc_record(hostname=hostname,
                                 use_cache=use_cache,
                                 cache_expiration=cache_expiration)

    if not record:
        # We need to fetch from the Organizational Domain for the hostname
        # provided. For this, we need to look up from a Public Suffix list.
        # The third-party module 'publicsuffix' will help us find that
        # domain, in combination with a data file we must ship.
        filename = 'mail/public_suffix_list.dat'

        try:
            stream = pkg_resources.resource_stream('djblets', filename)
            reader = codecs.getreader('utf-8')
            psl = PublicSuffixList(reader(stream))
        except IOError as e:
            logger.error('Unable to read public domain suffix list file '
                         '"%s" from Djblets package: %s',
                         filename, e)
        else:
            new_hostname = psl.get_public_suffix(hostname)

            if new_hostname != hostname:
                record = _fetch_dmarc_record(hostname=new_hostname,
                                             use_cache=use_cache,
                                             cache_expiration=cache_expiration)

    return record
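Based purely on the docstring above, a call sketch (the hostname is illustrative and assumes the function is importable from its module):

record = get_dmarc_record('foo.bar.example.com', use_cache=True)
if record is None:
    print('No DMARC record found for the host or its organizational domain')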
Example #14
def normdomain(domain):
    from publicsuffix import PublicSuffixList
    global _psl

    if _psl is None:
        print('ssl: loading public suffix list')
        _psl=PublicSuffixList(open(ssl_config.psl_filename, encoding='utf-8'))

    suf=_psl.get_public_suffix(domain)
    return domain if domain==suf else '*.%s'%suf
Example #15
def isHaveLongDigit(url):
    hostname = PublicSuffixList().get_public_suffix(url)


    m = re.match(r'\d*',hostname.split('.')[0])
    if m != None:
        if len(m.group()) >6:
            return 1

    
    return 0
Example #16
def get_exact_domain( args ):
    ret = []
    psl = PublicSuffixList()
    for url in args:
        url = url.strip()
        u = urlparse(url)
        h = u.hostname
        if not h:
            h = url
        ret.append(psl.get_public_suffix(h))
    return ret
Example #17
def normdomain(domain):
    from publicsuffix import PublicSuffixList
    global _psl

    if _psl is None:
        print('utils: loading public suffix list')
        _psl = PublicSuffixList(open(ssl_config.psl_filename,
                                     encoding='utf-8'))

    suf = _psl.get_public_suffix(domain)
    return domain if domain == suf else '*.%s' % suf
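The normdomain variants in this collection (Examples #8, #12, #14 and #17) all reduce a host below its registrable domain to a wildcard name. The idea in isolation, using the bundled list instead of ssl_config.psl_filename:

from publicsuffix import PublicSuffixList

psl = PublicSuffixList()
for name in ('example.com', 'a.b.example.com'):
    suf = psl.get_public_suffix(name)
    print(name if name == suf else '*.%s' % suf)  # 'example.com', then '*.example.com'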
Example #18
    def __init__(self):
        self.tlds = self._load_tlds()
        self.alexa_top = self._load_alexa_whitelist()
        self.psl = PublicSuffixList(input_file=codecs.open(
            GENERATOR_CONFIG['tld_names_file'], "r", "utf8"))
        self.homoglyphs_confusables = self.loadconfusables()

        self.keyboards = [
            GENERATOR_CONFIG['qwerty'], GENERATOR_CONFIG['qwertz'],
            GENERATOR_CONFIG['azerty']
        ]

        self.homoglyphs = GENERATOR_CONFIG['homoglyphs']

        self.typo_domains = list()
Example #19
    def __init__(self, **kwargs):
        """ args - passed in by optparse """
        self.cc_filters = []
        self.asn_filters = []
        self.format = None
        self.has_header = False
        self.infile = None
        self.outfile = None
        self.verbose = False
        self.quiet = False

        self.matchers = {}
        self.repo = NetObjectRepo()

        # regexs are intentionally broad - we'll run more tests later.

        self.matchers["ip"] = {"rex": "(?:\d+\.){3}\d+", "chk_func": self._is_valid_ip, "type": "ip"}
        self.matchers["hostname"] = {
            "rex": "([a-zA-Z0-9\-\.]+\.[0-9a-zA-Z\-\.]+)(?:\d+)?",
            "chk_func": self._is_valid_domain,
            "type": "domain",
        }

        logging.info("Getting Public Suffix List")
        self.psl = PublicSuffixList(self._get_psl_file())
        logging.info("Got Public Suffix List")

        self.parse_args(**kwargs)
Example #20
class Streamifier(BaseStreamifier):
    """
  Use the Public Suffix List <http://publicsuffix.org> to split the messages
  into streams, one per direction per suffix.
  """
    def __init__(self, procs):
        BaseStreamifier.__init__(self, procs)
        self.psl = PublicSuffixList()

    def streamify(self, messages):
        """
    Given a list of messages (each a req, res tuple), return a list of
    Stream objects.
    """
        reqs = defaultdict(list)
        ress = defaultdict(list)
        suffixes = []
        for req, res in messages:
            host = req[':host']
            suffix = self.psl.get_public_suffix(host.split(":", 1)[0])
            if suffix not in suffixes:
                suffixes.append(suffix)
            reqs[suffix].append((req, host))
            ress[suffix].append((res, host))

        streams = []
        for suffix in suffixes:
            streams.append(Stream(suffix, reqs[suffix], 'req', self.procs))
            streams.append(Stream(suffix, ress[suffix], 'res', self.procs))
        return streams
Example #21
class Streamifier(BaseStreamifier):
  """
  Use the Public Suffix List <http://publicsuffix.org> to split the messages
  into streams, one per direction per suffix.
  """
  def __init__(self, procs):
    BaseStreamifier.__init__(self, procs)
    self.psl = PublicSuffixList()

  def streamify(self, messages):
    """
    Given a list of messages (each a req, res tuple), return a list of
    Stream objects.
    """
    reqs = defaultdict(list)
    ress = defaultdict(list)
    suffixes = []
    for req, res in messages:
      host = req[':host']
      suffix = self.psl.get_public_suffix(host.split(":", 1)[0])
      if suffix not in suffixes:
        suffixes.append(suffix)
      reqs[suffix].append((req, host))
      ress[suffix].append((res, host))

    streams = []
    for suffix in suffixes:
      streams.append(Stream(suffix, reqs[suffix], 'req', self.procs))
      streams.append(Stream(suffix, ress[suffix], 'res', self.procs))
    return streams
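Both Streamifier variants key requests and responses by the registrable domain of the :host header. A small self-contained sketch of that grouping (host names are invented):

from collections import defaultdict
from publicsuffix import PublicSuffixList

psl = PublicSuffixList()
hosts = ['www.example.com:443', 'cdn.example.com', 'api.another.org']
by_suffix = defaultdict(list)
for host in hosts:
    suffix = psl.get_public_suffix(host.split(':', 1)[0])  # strip any port
    by_suffix[suffix].append(host)
print(dict(by_suffix))  # {'example.com': [...], 'another.org': [...]}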
Example #22
File: mdx.py Project: peter-/pyFF
    def __init__(self,
                 pipes=None,
                 autoreload=False,
                 frequency=600,
                 aliases=None,
                 cache_enabled=True,
                 observers=None,
                 store=None):

        if aliases is None:
            aliases = ATTRS
        if not observers:
            observers = []
        if not pipes:
            pipes = []
        self._pipes = pipes
        self.cache_enabled = cache_enabled
        self.lock = ReadWriteLock()
        self.plumbings = [plumbing(v) for v in pipes]
        self.refresh = MDUpdate(cherrypy.engine, server=self, frequency=frequency)
        self.refresh.subscribe()
        self.aliases = aliases
        self.psl = PublicSuffixList()
        self.md = MDRepository(metadata_cache_enabled=self.cache_enabled, store=store)

        if autoreload:
            for f in pipes:
                cherrypy.engine.autoreload.files.add(f)
Example #23
def domain_parser(df,
                  field_list=[
                      "index", "url", "netloc", "domain", "site_name",
                      "pub_suff", "country_code", "country", "major_dom"
                  ]):
    #check if valid url, if so convert to unicode string, else mark as invalid, temporarily add to input DataFrame
    url = df.apply(link_clean, axis=1)
    url.name = 'url'
    urldf = pd.DataFrame(url)
    df['url'] = urldf
    #get netlocs
    nts = df.apply(netloc_grabber, axis=1)
    nts.name = 'netloc'  #name column
    #get country code list and public suffix list
    cclist = countrysufflist()
    psl = PublicSuffixList()
    #get dom data from netlocs
    domdata = nts.apply(dom_details_grabber, args=(psl, cclist))
    #get index as dataFrame and optional column
    original_ind = df.index
    original_ind.name = 'index'
    original_ind = pd.DataFrame(original_ind)
    #merge data (wiki id, )
    mergedf = pd.concat([
        original_ind, urldf,
        pd.DataFrame(nts),
        pd.DataFrame(domdata.tolist())
    ],
                        axis=1)

    return mergedf[field_list]
Example #24
class HTTPAuthority(Authority):
  suffix_list = PublicSuffixList()
  def __init__(self, user_info_obj, host_str, port_int):
    Authority.__init__(self, user_info_obj, host_str, port_int)
  def getDomainUnencoded(self):
    host_unencoded_str = self.getHostUnencodedStr()
    is_ip_address = HTTPAuthority._isIPAddress(host_unencoded_str)
    result = None
    if is_ip_address == False:
      result = HTTPAuthority.suffix_list.get_public_suffix(host_unencoded_str)
    return result
  @staticmethod
  def _isIPAddress(host_unencoded_str):
    IPV4ADDRESS_RE = uri_parser.IPV4ADDRESS_FRAG_RE
    IP_LITERAL_RE = uri_parser.IP_LITERAL_FRAG_RE
    IP_ADDRESS_RE = r"^(?:(?:" + IPV4ADDRESS_RE + r")|(?:" + IP_LITERAL_RE + r"))$"
    regex = re.compile(IP_ADDRESS_RE)
    m = regex.match(host_unencoded_str)
    is_ip_address = m != None
    return is_ip_address
  def copy(self):
    host_unencoded_str = self.getHostUnencodedStr()
    port_int = self.getPortInt()
    next_user_info_obj = self._getUserInfoCopy()
    http_authority = HTTPAuthority(next_user_info_obj, host_unencoded_str, port_int)
    return http_authority
Example #25
File: pshtt.py Project: amirian28/pshtt
def load_suffix_list():
    if SUFFIX_CACHE and os.path.exists(SUFFIX_CACHE):
        logging.debug("Using cached suffix list.")
        cache_file = codecs.open(SUFFIX_CACHE, encoding='utf-8')
        suffixes = PublicSuffixList(cache_file)
    else:
        # File does not exist, download current list and cache it at given location.
        logging.debug("Downloading the Public Suffix List...")
        cache_file = fetch()
        content = cache_file.readlines()
        suffixes = PublicSuffixList(content)

        if SUFFIX_CACHE:
            logging.debug("Caching suffix list at %s" % SUFFIX_CACHE)
            utils.write(''.join(content), SUFFIX_CACHE)

    return suffixes
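The download-then-cache pattern above condenses to a few lines; a sketch assuming the publicsuffix package's fetch() helper and a writable cache path of your choosing:

import codecs
from publicsuffix import PublicSuffixList, fetch

CACHE = '/tmp/public_suffix_list.dat'  # hypothetical cache location
content = fetch().read()               # download the current list
with codecs.open(CACHE, 'w', encoding='utf-8') as f:
    f.write(content)                   # cache it for later runs
suffixes = PublicSuffixList(codecs.open(CACHE, encoding='utf-8'))
print(suffixes.get_public_suffix('foo.bar.example.com'))  # 'example.com'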
Example #26
    def __init__(self, manager, args, mb_stream, states_context):
        self.logger = logging.getLogger("discovery")
        backend = manager.backend
        self.domain_cache = DomainCacheProxyWeb(backend.domain_metadata)

        try:
            psl_file = codecs.open("public_suffix_list.dat", encoding='utf8')
        except IOError:
            self.logger.exception("Please get the public suffix file from https://publicsuffix.org/")
            raise
        self._suffix_list = PublicSuffixList(psl_file)
        self._states_ctx = states_context
        self.states = backend.states

        self.user_agent = to_native_str(manager.settings.get('USER_AGENT'))
        self.max_pages = int(manager.settings.get('DISCOVERY_MAX_PAGES'))
        super(Discovery, self).__init__(manager, args, mb_stream, states_context)
Example #27
 def __init__(self,
              host,
              port=0,
              auth=None,
              use_ssl=False,
              starttls=False,
              prefix="noreply"):
     self._host = host
     self._port = port
     auth = auth or {}
     self._auth_user = auth.get('user')
     self._auth_password = auth.get('password')
     self._use_ssl = use_ssl
     self._starttls = starttls
     self.psl_file = publicsuffix.fetch()
     self.psl = PublicSuffixList(self.psl_file)
     self.prefix = prefix
Example #28
    def __init__(self, whois_sleep_seconds=10, nameservers=['8.8.8.8', '8.8.4.4'], log_filename=None):
        ''' initialize a worker object to do work
        '''
        self.psl = PublicSuffixList()
        self.whois_sleep_seconds = whois_sleep_seconds
        self.my_resolver = dns.resolver.Resolver()
        self.my_resolver.nameservers = nameservers

        with open(os.path.join(os.path.dirname(__file__), '../aux/whois_server_ips')) as f:
            self.whois_server_ips = json.load(f)

        self.logger = logging.getLogger('snapshooter_worker')
        self.logger.setLevel(logging.DEBUG)
        ch = logging.StreamHandler()
        self.logger.addHandler(ch)
        if log_filename:
            fh = logging.FileHandler(log_filename)
            self.logger.addHandler(fh)
Example #29
File: mdx.py Project: lhoekenga/pyFF
    def __init__(self, pipes=None, observers=None):

        if not observers:
            observers = []
        if not pipes:
            pipes = []
        self._pipes = pipes
        self.lock = ReadWriteLock()
        self.plumbings = [plumbing(v) for v in pipes]
        self.refresh = MDUpdate(cherrypy.engine, server=self, frequency=config.frequency)
        self.refresh.subscribe()
        self.aliases = config.aliases
        self.psl = PublicSuffixList()
        self.md = MDRepository(metadata_cache_enabled=config.caching_enabled, store=config.store)

        if config.autoreload:
            for f in pipes:
                cherrypy.engine.autoreload.files.add(f)
Example #30
def main():
	ip_hostname_map = extract_host()									# Get ip::hostname map
	
	try:
		con = lite.connect('router-naming.db')							# Connecting to db
		cur = con.cursor()

		#cur.execute("DROP TABLE IF EXISTS NodeHost")					# Creating tables
		#cur.execute("DROP TABLE IF EXISTS NodeDegree")
		cur.execute("CREATE TABLE NodeHost(IP TEXT, PublicSuffix TEXT, HostName TEXT, NodeID TEXT, NID INT)")
		cur.execute("CREATE TABLE NodeDegree(IpDegree INT, HostNameDegree INT, NodeID TEXT, NID INT)")

		p = PublicSuffixList()											# Stripping public suffix domain

		node_file = open('data_set/midar-iff.nodes', 'r')				# Read in node file (midar-iff.nodes)
		node_file_data = node_file.readlines()

		for row in node_file_data:										# Go through all nodes
			if row[0] != "#":											# Ignore comments
				
				node_id = re.search(r'N\d*', row).group()				# Find node id in form of N<digits>
				node_id_int = node_id[1:]

				match_ip = re.split(r'node\sN\d*:\s*', row)				# Find list of ip addr of this node
				ip_list = match_ip[1].split()

				hit_count = 0											# Count ips found in ip::hostname map

				for ip in ip_list:										# Check all ips of this node
					if ip in ip_hostname_map:
						hit_count += 1
						hostname = ip_hostname_map[ip]
						public_sfx = p.get_public_suffix(hostname)
						cur.execute(r"INSERT INTO NodeHost VALUES('%s', '%s', '%s', '%s', '%s');" % (ip, public_sfx, hostname, node_id, node_id_int))

				if hit_count > 0:										# If there is at least one hit
					cur.execute(r"INSERT INTO NodeDegree VALUES(%d, %d, '%s', '%s');" % (len(ip_list), hit_count, node_id, node_id_int))
		con.commit()

	except lite.Error, e:
		if con:
			con.rollback()
		print "Databse error %s:" % e.args[0]
		sys.exit(1)
Example #31
def get_psl(location=PSL_CACHE_LOC):
    """
    Grabs an updated public suffix list.
    """
    if not os.path.isfile(location):
        psl_file = fetch()
        with codecs.open(location, 'w', encoding='utf8') as f:
            f.write(psl_file.read())
    psl_cache = codecs.open(location, encoding='utf8')
    return PublicSuffixList(psl_cache)
Example #32
def get_psl():
    """
    Grabs an updated public suffix list.
    """
    if not os.path.isfile(PSL_CACHE_LOC):
        print "%s does not exist, downloading a copy." % PSL_CACHE_LOC
        psl_file = fetch()
        with codecs.open(PSL_CACHE_LOC, 'w', encoding='utf8') as f:
            f.write(psl_file.read())
    psl_cache = codecs.open(PSL_CACHE_LOC, encoding='utf8')
    return PublicSuffixList(psl_cache)
Example #33
def main():


	desc="Glenn's Firelamb: This tool will parse pcap files or listen on an interface for cookies. Cookies get saved to a Firefox cookies.sqlite file - one cookie file per observed device. ([email protected])"
	parser=OptionParser(description=desc)
	parser.add_option("-f", "--file", dest="fname",help="Specify pcap file to read")
	parser.add_option("-i", "--interface", dest="iface",help="Specify interface to listen on")
	parser.add_option("-p", "--ip_logging",action="store_true",dest="log_by_ip",default=False,help="Create cookie file per IP address. Default is per device MAC address")
	parser.add_option("-l", "--launch_firefox",dest="launch_ff",action = "store_true",default=False,help="Launch Firefox profiles for the saved cookies")
	parser.add_option("-s", "--karma_sslstrip",dest="sslstriplog",default=None,help="SSLStrip log file")
	parser.add_option("-t", "--karma_sslsplit",dest="sslsplitdir",default=None,help="Directory of SSLSplit log files")

	sqlv=sqlite3.sqlite_version.split('.')
	if (sqlv[0] <3 or sqlv[1] < 7):
		print "MANA (FireLamb) : [!] WARNING. sqlite3 version 3.7 or greater required. You have version %s.I'll try continue, but will likely not be able to write Firefox cookie files." %sqlite3.sqlite_version




	global psl
	global ip_logging
	psl = PublicSuffixList()

	(options, args) = parser.parse_args()
	ip_logging=options.log_by_ip


	if( not options.fname and not options.iface and not options.launch_ff):
			print parser.print_help()
			exit(-1)

	if(options.launch_ff):
		if (options.sslsplitdir):
			parsesslsplit(options.sslsplitdir)
		launch_firefox()

	else:

		if not os.path.exists(save_dir):
    			os.makedirs(save_dir)
		print "MANA (FireLamb) : [+] Saving output to %s" %save_dir


		if(options.iface):
			print "MANA (FireLamb) : [+] Listening for cookie traffic on interface %s" %options.iface
			sniff(iface=options.iface,prn=process)

		elif(options.fname):
			print "MANA (FireLamb) : [+] Reading pcap file '%s'...." %options.fname
			packets=rdpcap(options.fname)
			print "MANA (FireLamb) : [+] Processing file contents..."
			for p in packets:
				process(p)
			print "MANA (FireLamb) : [+] Done."
Example #34
File: url.py Project: hayj/DataTools
 def initPublicSuffixList(self):
     if self.psl is not None:
         return self.psl
     try:
         if fileExists(self.pslCachePath) and getLastModifiedTimeSpent(self.pslCachePath, TIMESPENT_UNIT.DAYS) < self.timeSpentMax:
             pslFile = codecs.open(self.pslCachePath, encoding='utf8')
             self.psl = PublicSuffixList(pslFile)
             pslFile.close()
             return self.psl
         else:
             (dir, filename, ext, filenameExt) = decomposePath(self.pslCachePath)
             mkdirIfNotExists(dir)
             pslData = list(publicsuffix.fetch())
             removeIfExists(self.pslCachePath)
             strToFile(pslData, self.pslCachePath)
             self.psl = PublicSuffixList(pslData)
             return self.psl
     except Exception as e:
         logException(e, self, location="initPublicSuffixList")
         return None
Example #35
    def __init__(self, **kwargs):
        self.drone = kwargs.get('drone', "no_drone_name_supplied")
        self.verb = kwargs.get('verbose', 0)
        self.fname = os.path.splitext(
            os.path.basename(os.path.basename(__file__)))[0]
        self.psl = PublicSuffixList()

        self.cookies = fifoDict(names=("drone", "client_mac", "client_ip",
                                       "host", "name", "value", "baseDomain",
                                       "address", "lastAccessed",
                                       "creationTime"))
        self.userAgents = fifoDict(names=("mac", "userAgent"))
Example #36
File: pshtt.py Project: siparker/pshtt
def load_suffix_list():
    # File does not exist, download current list and cache it at given location.
    utils.debug("Downloading the Public Suffix List...", divider=True)
    try:
        cache_file = fetch()
    except URLError as err:
        logging.warn("Unable to download the Public Suffix List...")
        utils.debug("{}".format(err))
        return []
    content = cache_file.readlines()
    suffixes = PublicSuffixList(content)
    return suffixes, content
Example #37
def domain_split(server_domain):
    '''
    server_domain is the service/host prefix plus the domain name used by the site.
    Split the domain name into a prefix (service name), a host domain, and a suffix (top-level domain):
        input www.baidu.com   -> 'www', 'baidu', 'com'
        input 172.31.137.240  -> '', '172.31.137.240', ''
    '''
    PSL_FILE = codecs.open('public_suffix_list.dat', encoding='utf8')
    psl = PublicSuffixList(PSL_FILE)
    domain = psl.get_public_suffix(server_domain)
    # The first label of the registrable domain (before the first '.') is the host domain; the rest is the top-level domain, and whatever precedes the registrable domain is the service prefix
    if '.' in domain:
        server = server_domain[:-len(domain)]
        host = domain[:domain.index('.')]
        top = domain[domain.index('.'):]
        hostname = server + host + top
    else:  # Domain extraction failed (e.g. an IP address such as 172.31.137.240); treat the whole value as the host domain
        server = ''
        host = server_domain
        top = ''
        hostname = server_domain
    return server, host, top, hostname
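For reference, a minimal call of domain_split as documented above (it expects public_suffix_list.dat in the working directory); judging from the code, the returned prefix and suffix keep their separating dots:

server, host, top, hostname = domain_split('www.baidu.com')
print(server, host, top, hostname)  # roughly: 'www.' 'baidu' '.com' 'www.baidu.com'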
Example #38
File: pshtt.py Project: IanLee1521/pshtt
def load_suffix_list():
    if SUFFIX_CACHE and os.path.exists(SUFFIX_CACHE):
        utils.debug("Using cached suffix list.", divider=True)
        cache_file = codecs.open(SUFFIX_CACHE, encoding='utf-8')
        suffixes = PublicSuffixList(cache_file)
    else:
        # File does not exist, download current list and cache it at given location.
        utils.debug("Downloading the Public Suffix List...", divider=True)
        try:
            cache_file = fetch()
        except URLError as err:
            logging.warn("Unable to download the Public Suffix List...")
            utils.debug("{}".format(err))
            return []
        content = cache_file.readlines()
        suffixes = PublicSuffixList(content)

        if SUFFIX_CACHE:
            utils.debug("Caching suffix list at %s" % SUFFIX_CACHE,
                        divider=True)
            utils.write(''.join(content), SUFFIX_CACHE)

    return suffixes
Example #39
class CommentsPortalUrls(NodesEdgesCreator):

    def __init__(self):
        self.psl = PublicSuffixList()

    def create(self, articles):
        articles = filter(lambda a: len(a['comments']) > 0, articles)
        domains = []
        for article in articles:
            for comment in article['comments']:
                user = comment['author']
                urls = self.__extract_urls_from_comment(comment)
                extracted_domains = self.__convert_urls_to_domains(urls)
                for domain in extracted_domains:
                    ds = filter(lambda d: d.url == domain, domains)
                    if len(ds) > 0:
                        d = ds[0]
                    else :
                        d = Domain(domain)
                        domains.append(d)
                    d.add_user(user)
        nodes = self.__create_nodes(domains)
        return nodes, domains

    def __create_nodes(self, domains):
        nodes = []
        for domain in domains:
            nodes.append(domain)
            for user in domain.edges:
                if user not in nodes:
                    nodes.append(user)
        return nodes

    def __extract_urls_from_comment(self, comment):
        html = comment['htmlContent']
        matches = re.findall(r'<a href="(.*?)" target="_blank" rel="nofollow">\1</a>', html)
        return matches

    def __extract_domain(self, url):
        url = url.lower()
        matches = re.search(r'(h|Ht|Tt|Tp|Ps|S)?:[/\\]{2}([wW]{3}\.)?([^:/\\]+)', url)
        match = matches.group(3)
        domain = self.psl.get_public_suffix(match)
        return domain

    def __convert_urls_to_domains(self, urls):
        return map(lambda u: self.__extract_domain(u), urls)
Example #40
    def __init__(self, pipes=None, observers=None):

        if not observers:
            observers = []
        if not pipes:
            pipes = []
        self._pipes = pipes
        self.lock = ReadWriteLock()
        self.plumbings = [plumbing(v) for v in pipes]
        self.refresh = MDUpdate(cherrypy.engine, server=self, frequency=config.frequency)
        self.refresh.subscribe()
        self.aliases = config.aliases
        self.psl = PublicSuffixList()
        self.md = MDRepository(metadata_cache_enabled=config.caching_enabled, store=config.store)

        if config.autoreload:
            for f in pipes:
                cherrypy.engine.autoreload.files.add(f)
Example #41
def run(fname=None,
        iface=None,
        log_by_ip=False,
        launch_ff=False,
        sslstriplog=None,
        sslsplitdir=None):

    global psl
    global ip_logging

    psl = PublicSuffixList()

    ip_logging = log_by_ip

    if (not fname and not iface and not launch_ff):
        print parser.print_help()
        exit(-1)

    if (launch_ff):
        if (sslsplitdir):
            parsesslsplit(sslsplitdir)
        launch_firefox()

    else:

        if not os.path.exists(save_dir):
            os.makedirs(save_dir)
        print "MANA (FireLamb) : [+] Saving output to %s" % save_dir

        if (iface):
            print "MANA (FireLamb) : [+] Listening for cookie traffic on interface %s" % iface
            sniff(iface=iface, prn=process)

        elif (fname):
            print "MANA (FireLamb) : [+] Reading pcap file '%s'...." % fname
            packets = rdpcap(fname)
            print "MANA (FireLamb) : [+] Processing file contents..."
            for p in packets:
                process(p)
            print "MANA (FireLamb) : [+] Done."
Example #42
def alexa_malware_scan(url):
    domain = PublicSuffixList().get_public_suffix(
        urlparse(url).netloc)  # IRIs are going to be a pain here.
    pipe = redis_db["slave"].pipeline()
    pipe.hlen(domains_key)
    pipe.hmget(domains_key, domain)
    total, score = pipe.execute()
    score = score[0]

    def rank_to_ratio(score, total):
        """
        if the score is between 1 and 1 million never return 1
        If the score is none return 1
        """
        if score is not None:
            score = int(score) - 1
            total = total

            return score / total
        else:
            return 1

    return [{"type": "generic", "confidence": rank_to_ratio(score, total)}]
Example #43
File: mdx.py Project: johanlundberg/pyFF
    def __init__(self,
                 pipes=None,
                 autoreload=False,
                 frequency=600,
                 aliases=ATTRS,
                 cache_enabled=True,
                 hosts_dir=None,
                 observers=[]):
        if pipes is None:
            pipes = []
        self.cache_enabled = cache_enabled
        self._md = None
        self.lock = ReadWriteLock()
        self.plumbings = [plumbing(v) for v in pipes]
        self.refresh = MDUpdate(cherrypy.engine, server=self, frequency=frequency)
        self.refresh.subscribe()
        self.aliases = aliases
        self.observers = observers
        self.psl = PublicSuffixList()

        if autoreload:
            for f in pipes:
                cherrypy.engine.autoreload.files.add(f)
Example #44
class UrlChecker(object):
	
	psl = None #public suffix list object
	netloc_cache = {}
	
	def __init__(self):
		self.psl = PublicSuffixList()

	def is_valid_url(self, url):
		isUrlOk = False
		
		if len(url):
			#parse url			
			u = urlparse.urlparse(url)
			
			#add scheme if missing
			if u.scheme == u'':
				url = u'http://' + url
			u = urlparse.urlparse(url)
			
			#get netloc
			netloc = u.netloc
			
			if self.netloc_cache.has_key(netloc):
				isUrlOk = self.netloc_cache[netloc]
			else:
				query_domain = self.psl.get_public_suffix(netloc)
				
				#check if it is google
				parts = query_domain.split(u'.')
				
				if len(parts) > 0 and parts[0].upper() == u'GOOGLE':
					isUrlOk = True
					
				self.netloc_cache[netloc] = isUrlOk
		
		return isUrlOk
	def _parse_html(self,html):
		"""parse html; return tuple (array html data, int res_stats, ); extracts result stats, title, url, content, time of content, related links"""
		
		bs = BeautifulSoup(html)
		
		res_docs = []
		
		#get google estimation on number of results
		tot_res_el = bs.find('div',id = 'resultStats')
		tot_res = 0
		if tot_res_el:
			non_decimal = re.compile(r'[^\d]+')
			try:
				tot_res = int(non_decimal.sub('',tot_res_el.text))
			except Exception:
				pass
		
		#instance of Suffix list to extract top level domain from url
		psl = PublicSuffixList()
		
		#get li objects with main search results
		lis = bs.find_all('li','g')
		url_search = re.compile(r'\?q=.*sa=')
		type_search = re.compile(r'(\/url)|(\/images)|(\/search)')
		for li in lis:
			#check url type of title
			h3 = li.select('h3 a')
			container_type = ''
			if len(h3) > 0:
				href = h3[0].get('href')
				container_type = type_search.match(href)
				if container_type:
					container_type = container_type.group().lower()
					container_type = container_type[1:] #remove leading slash
			
			if container_type == u'url':
				doc = dict()
				
				#get title
				ttl_el = li.find(u'',u'r')
				ttl = ttl_el.text
				
				#get title url
				ttl_url = ttl_el.find('a').get('href')
				#clean url
				m = url_search.search(ttl_url)
				if m:
					ttl_url = m.group(0)[3:-4] #3 for ?q=, 4 for &sa=
					ttl_url = urllib.unquote(ttl_url)
					#ttl_url = urllib.unquote(ttl_url)
				
				#get tld
				tld = u''
				try:
					u = urlparse.urlparse(ttl_url)
					tld = psl.get_public_suffix(u.netloc)
				except Exception:
					pass
				
				#get content
				cnt_el = None
				cnt_els = li.select('div.s span.st')
				cnt = u''
				if len(cnt_els) > 0:
					cnt_el = cnt_els[0]
				
				if cnt_el:
					cnt = cnt_el.text
				
				#get time of content
				cnt_time = u''
				if cnt_el:
					#get first <b> element and check its value
					b = cnt_el.find('b')
					if b and b.text == u'...':
						#get time
						cnt_time = cnt_el.strings.next()
						
						if len(cnt_time) > 25:
							cnt_time = u''
				
				#get related links
				rel_links = []
				rel_links_els = li.select('div.s div.osl a')
				for rel_link_el in rel_links_els:
					rel_link = rel_link_el.get('href')
					
					#clean url
					m = url_search.search(rel_link)
					if m:
						rel_link = m.group(0)[3:-4]
						rel_link = urllib.unquote(rel_link)
						#rel_link = urllib.unquote(rel_link)
					
					#add url to array
					rel_links.append(rel_link)
				
				#insert elements into doc
				doc['title'] = ttl
				doc['title_url'] = ttl_url
				doc['tld'] = tld
				doc['content'] = cnt
				doc['content_time'] = cnt_time
				doc['related_links'] = rel_links
				doc['type'] = container_type
				
				#add to docs array
				res_docs.append(doc)
				
			else:
				#here go images, vides, news and other
				pass
		
		#return tuple
		return (res_docs, tot_res, )
	def scrape(self, url, return_data):
		current_task = self.current_task
		logger = self.logger

		url = url.encode('utf-8')
		
		#change status
		current_task.update_state(state=u'STARTED', meta={'url': url, 'group': self.group_name})
		
		logger.info('TASK EXECUTING: %r, args: %r kwargs: %r' % ( self.current_task.request.id,
			self.current_task.request.args, self.current_task.request.kwargs))
		
		#avoid doing anything if url is empty
		if len(url) == 0:
			self._wrong_param_exception(url)

		#parse url
		u = urlparse.urlparse(url)
		
		#add scheme if missing
		if u.scheme == '':
			url = 'http://' + url
		u = urlparse.urlparse(url)
		
		#get netloc
		netloc = u.netloc
		
		#get parsed query
		qs = urlparse.parse_qs(u.query)
		start = (qs['start'] if 'start' in qs else ['0'])[0]
		start = int(start)

		#convert qs elements from array to normal strings
		for k in qs.keys():
			el = qs[k]
			el = ' '.join(el)
			
			#try to convert number strings into numbers
			new_k = k.lower()
			if new_k == 'num' or new_k == 'start':
				el = el.replace(' ', '')
				try:
					el = int(el)
				except Exception:
					pass
			
			qs[k] = el
		
		#add default values for num and start if there are none
		if not 'num' in qs:
			qs['num'] = 10
		if not 'start' in qs:
			qs['start'] = 0
		
		#get domain name of the query
		psl = PublicSuffixList()
		query_domain = psl.get_public_suffix(netloc)
		
		#check if it is google
		parts = query_domain.split(u'.')

		scraped_docs = ''
		if len(parts) > 0 and parts[0].upper() == u'GOOGLE':
			current_task.update_state(state=u'CONNECTING', meta={'url': url, 'group': self.group_name})
			
			#create request
			req = {
				'url': urllib.quote_plus(url, "%/:=&?~#+!$,;'@()*[]"),
				'referer': u'http://google.com',
				'useragent': u'Webscraper/' + CustomAppSettings.get_version() + ' (+http://www.yriver.it/webscraper/)',#'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.47 Safari/537.36',
				'region': u'gb',
				'priority': u'1'
			}
			
			#define which scrape key to take
			scrape_key = get_scrape_key_special if self.plan['is_special'] else get_scrape_key
			
			#make query
			query = {'key': scrape_key(), 'request': req}
			
			p = ProxyConnection(self.user, self.plan)
			html = p.send_request(query)
			
			#parse html
			scraped_docs = ''

			if len(html) > 0 and html != '0':
				scraped_docs, tot_res_est = self._parse_html(html)
				#write into db
				self._db_write_res(task_id=current_task.request.id, url=url, group_name=self.group_name, results=scraped_docs, tot_res=tot_res_est, start=start,
						query=qs, domain=query_domain)
		
		# Convert to Base64 if return_data = True
		if return_data:
			encoded_result = base64.standard_b64encode(json.dumps(scraped_docs))
			return {'url': url, 'group_name': self.group_name, 'domain': query_domain, 'b64_json': encoded_result}
		else:
			return {'url': url, 'group_name': self.group_name}
Example #47
 def __init__(self):
     self.psl = PublicSuffixList()
Example #48
File: mdx.py Project: leifj/pyFF
class MDServer(object):
    """The MDServer class is the business logic of pyFF. This class is isolated from the request-decoding logic
    of MDRoot and from the ancillary classes like MDStats and WellKnown.
    """

    def __init__(self, pipes=None, observers=None):

        if not observers:
            observers = []
        if not pipes:
            pipes = []
        self._pipes = pipes
        self.lock = ReadWriteLock()
        self.plumbings = [plumbing(v) for v in pipes]
        self.refresh = MDUpdate(cherrypy.engine, server=self, frequency=config.update_frequency)
        self.refresh.subscribe()
        self.aliases = config.aliases
        self.psl = PublicSuffixList()
        self.md = MDRepository()
        self.ready = False

        if config.autoreload:
            for f in pipes:
                cherrypy.engine.autoreload.files.add(f)

    def reload_pipeline(self):
        new_plumbings = [plumbing(v) for v in self._pipes]
        self.plumbings = new_plumbings

    class MediaAccept(object):

        def __init__(self):
            pass

        def has_key(self, key):
            return True

        def get(self, item):
            return self.__getitem__(item)

        def __getitem__(self, item):
            try:
                return cptools.accept(item, debug=True)
            except HTTPError:
                return False

    def request(self, **kwargs):
        """The main request processor. This code implements all rendering of metadata.
        """

        if not self.ready:
            raise HTTPError(503, _("Service Unavailable (repository loading)"))

        pfx = kwargs.get('pfx', None)
        path = kwargs.get('path', None)
        content_type = kwargs.get('content_type', None)
        request_type = kwargs.get('request_type', "negotiate")

        log.debug("MDServer pfx=%s, path=%s, content_type=%s" % (pfx, path, content_type))

        def _d(x, do_split=True):
            dot = six.u('.')
            if x is not None:
                x = x.strip()
            # log.debug("_d(%s,%s)" % (x, do_split))
            if x is None or len(x) == 0:
                return None, None

            if x.startswith("{base64}"):
                x = safe_b64d(x[8:])
                if isinstance(x, six.binary_type):
                    x = x.decode()

            if do_split and dot in x:
                (pth, _, extn) = x.rpartition(dot)
                if extn in _ctypes:
                    return pth, extn

            return x, None

        _ctypes = {'xml': 'application/xml',
                   'json': 'application/json',
                   'htm': 'text/html',
                   'html': 'text/html',
                   'ds': 'text/html',
                   's': 'application/json'}

        alias = None
        if pfx:
            alias = pfx
            pfx = self.aliases.get(alias, None)
            if pfx is None:
                raise NotFound()

        path, ext = _d(path, content_type is None)
        if pfx and path:
            q = "{%s}%s" % (pfx, path)
            path = "/%s/%s" % (alias, path)
        else:
            q = path

        if ext is not None:
            log.debug("request path: %s.%s, headers: %s" % (path, ext, cherrypy.request.headers))
        else:
            log.debug("request path: %s, headers: %s" % (path, cherrypy.request.headers))

        accept = {}
        if content_type is None:
            if ext is not None and ext in _ctypes:
                accept = {_ctypes[ext]: True}
            else:
                accept = MDServer.MediaAccept()
                if ext is not None:
                    path = "%s.%s" % (path, ext)
        else:
            accept = {content_type: True}

        with self.lock.readlock:
            if ext == 'ds':
                pdict = dict()
                entity_id = kwargs.get('entityID', None)
                if entity_id is None:
                    raise HTTPError(400, _("400 Bad Request - missing entityID"))

                e = self.md.store.lookup(entity_id)
                if e is None or len(e) == 0:
                    raise HTTPError(404)

                if len(e) > 1:
                    raise HTTPError(400, _("Bad Request - multiple matches for") + " %s" % entity_id)

                pdict['entity'] = entity_simple_summary(e[0])
                if not path:
                    pdict['search'] = "/search/"
                    pdict['list'] = "/role/idp.json"
                else:
                    pdict['search'] = "{}.s".format(escape(path, quote=True))
                    pdict['list'] = "{}.json".format(escape(path, quote=True))

                pdict['storage'] = "/storage/"
                cherrypy.response.headers['Content-Type'] = 'text/html'
                return render_template(config.ds_template, **pdict)
            elif ext == 's':
                paged = bool(kwargs.get('paged', False))
                query = kwargs.get('query', None)
                page = kwargs.get('page', 0)
                page_limit = kwargs.get('page_limit', 10)
                entity_filter = kwargs.get('entity_filter', None)
                related = kwargs.get('related', None)

                cherrypy.response.headers['Content-Type'] = 'application/json'
                cherrypy.response.headers['Access-Control-Allow-Origin'] = '*'

                if query is None:
                    log.debug("empty query - creating one")
                    query = [cherrypy.request.remote.ip]
                    referrer = cherrypy.request.headers.get('referrer', None)
                    if referrer is not None:
                        log.debug("including referrer: %s" % referrer)
                        url = urlparse(referrer)
                        host = url.netloc
                        if ':' in url.netloc:
                            (host, port) = url.netloc.split(':')
                        for host_part in host.rstrip(self.psl.get_public_suffix(host)).split('.'):
                            if host_part is not None and len(host_part) > 0:
                                query.append(host_part)
                    log.debug("created query: %s" % ",".join(query))

                if paged:
                    res, more, total = self.md.store.search(query,
                                                            path=q,
                                                            page=int(page),
                                                            page_limit=int(page_limit),
                                                            entity_filter=entity_filter,
                                                            related=related)
                    # log.debug(dumps({'entities': res, 'more': more, 'total': total}))
                    return dumps({'entities': res, 'more': more, 'total': total})
                else:
                    return dumps(self.md.store.search(query,
                                                      path=q,
                                                      entity_filter=entity_filter,
                                                      related=related))
            elif accept.get('text/html'):
                if not q:
                    if pfx:
                        title = pfx
                    else:
                        title = _("Metadata By Attributes")
                    return render_template("index.html",
                                           md=self.md,
                                           samlmd=samlmd,
                                           alias=alias,
                                           aliases=self.aliases,
                                           title=title)
                else:
                    entities = self.md.lookup(q)
                    if not entities:
                        raise NotFound()
                    if len(entities) > 1:
                        return render_template("metadata.html",
                                               md=self.md,
                                               samlmd=samlmd,
                                               subheading=q,
                                               entities=entities)
                    else:
                        entity = entities[0]
                        return render_template("entity.html",
                                               headline=entity_display_name(entity),
                                               subheading=entity.get('entityID'),
                                               entity_id=entity.get('entityID'),
                                               samlmd=samlmd,
                                               entity=entity_info(entity))
            else:
                for p in self.plumbings:
                    state = {'request': request_type,
                             'headers': {'Content-Type': 'text/xml'},
                             'accept': accept,
                             'url': cherrypy.url(relative=False),
                             'select': q,
                             'path': path,
                             'stats': {}}
                    r = p.process(self.md, state=state)
                    if r is not None:
                        cache_ttl = state.get('cache', 0)
                        log.debug("caching for %d seconds" % cache_ttl)
                        for k, v in list(state.get('headers', {}).items()):
                            cherrypy.response.headers[k] = v
                        cherrypy.response.headers['Access-Control-Allow-Origin'] = '*'
                        caching.expires(secs=cache_ttl)
                        return r
        raise NotFound()
Example #49
#!/usr/bin/python
#encoding:utf-8

from publicsuffix import PublicSuffixList

domainParser = PublicSuffixList()
# print domainParser.get_public_suffix("www.example.com.cn")
# print domainParser.get_public_suffix("www.example.com.uk")
# print domainParser.get_public_suffix("jaysonhwang.sinaapp.com")
# print domainParser.get_public_suffix("1.jaysonhwang.sinaapp.com")
# print domainParser.get_public_suffix("jaysonhwang.sinaapp.com/web/1")

print domainParser.get_domain("http://192.168.0.100:8080/web")
print domainParser.get_domain("http://www.qq.com")
allow = [
    "http://www.people.com.cn",
    "http://www.xinhuanet.com",
    "http://www.qq.com",
    "http://www.163.com",
    "http://www.cntv.cn",
    "http://www.ifeng.com",
    "http://www.hexun.com",
    "http://www.sina.com.cn",
    "http://www.sohu.com",
    "http://www.dbw.cn",]
for a in allow:
    print domainParser.get_domain(a)[0]
Example #50
# -*- coding: utf-8 -*-
"""
Created on Fri Jun 27 11:57:34 2014

@author: Reed
"""

#script to find pathological URLs in URL list

##import re 
from publicsuffix import PublicSuffixList
from urllib.parse import urlparse
import os
import re
os.chdir("C://Users//Reed//Documents//Github//wikiplus1")
psl=PublicSuffixList()
with open("dat_wiki_urls.txt", 'r', encoding='utf-8') as infile:
    weirdurls_reg=[]
    weirdurls_arc=[]
    for line in infile:
        ##if len(re.findall("^//|^http://|^https://",line)) ==0:
            ##weirdurls.append(line)
    #check if potential web archive
        if re.search("//",line):
            slashsplit=line.split("http://")
            if len(slashsplit)<=2:
                #REMOVE PORT, ADD THIS TO SCRIPT PIPELINE
                dom=urlparse(line).netloc.split(':')[0]
                archive='NULL'
                tldn=psl.get_public_suffix(dom.rstrip())
                try:
Example #51
class MDServer(object):
    """The MDServer class is the business logic of pyFF. This class is isolated from the request-decoding logic
    of MDRoot and from the ancillary classes like MDStats and WellKnown.
    """

    def __init__(self, pipes=None, observers=None):

        if not observers:
            observers = []
        if not pipes:
            pipes = []
        self._pipes = pipes
        self.lock = ReadWriteLock()
        self.plumbings = [plumbing(v) for v in pipes]
        self.refresh = MDUpdate(cherrypy.engine, server=self, frequency=config.frequency)
        self.refresh.subscribe()
        self.aliases = config.aliases
        self.psl = PublicSuffixList()
        self.md = MDRepository(metadata_cache_enabled=config.caching_enabled, store=config.store)

        if config.autoreload:
            for f in pipes:
                cherrypy.engine.autoreload.files.add(f)

    @property
    def ready(self):
        return self.md.store.ready()

    def reload_pipeline(self):
        new_plumbings = [plumbing(v) for v in self._pipes]
        self.plumbings = new_plumbings

    class MediaAccept(object):

        def __init__(self):
            pass

        def has_key(self, key):
            return True

        def get(self, item):
            return self.__getitem__(item)

        def __getitem__(self, item):
            try:
                return cptools.accept(item, debug=True)
            except HTTPError:
                return False

    def request(self, **kwargs):
        """The main request processor. This code implements all rendering of metadata.
        """
        stats['MD Requests'] += 1

        if not self.ready:
            raise HTTPError(503, _("Service Unavailable (repository loading)"))

        pfx = kwargs.get('pfx', None)
        path = kwargs.get('path', None)
        content_type = kwargs.get('content_type', None)

        log.debug("MDServer pfx=%s, path=%s, content_type=%s" % (pfx, path, content_type))

        def _d(x, do_split=True):
            if x is not None:
                x = x.strip()
            log.debug("_d(%s,%s)" % (x, do_split))
            if x is None or len(x) == 0:
                return None, None

            if x.startswith("{base64}"):
                x = x[8:].decode('base64')

            if do_split and '.' in x:
                (pth, dot, extn) = x.rpartition('.')
                assert (dot == '.')
                if extn in _ctypes:
                    return pth, extn

            return x, None

        _ctypes = {'xml': 'application/xml',
                   'json': 'application/json',
                   'htm': 'text/html',
                   'html': 'text/html',
                   'ds': 'text/html',
                   's': 'application/json'}

        alias = None
        if pfx:
            alias = pfx
            pfx = self.aliases.get(alias, None)
            if pfx is None:
                raise NotFound()

        path, ext = _d(path, content_type is None)
        if pfx and path:
            q = "{%s}%s" % (pfx, path)
            path = "/%s/%s" % (alias, path)
        else:
            q = path

        if ext is not None:
            log.debug("request path: %s.%s, headers: %s" % (path, ext, cherrypy.request.headers))
        else:
            log.debug("request path: %s, headers: %s" % (path, cherrypy.request.headers))

        accept = {}
        if content_type is None:
            if ext is not None and ext in _ctypes:
                accept = {_ctypes[ext]: True}
            else:
                accept = MDServer.MediaAccept()
                if ext is not None:
                    path = "%s.%s" % (path, ext)
        else:
            accept = {content_type: True}

        with self.lock.readlock:
            if ext == 'ds':
                pdict = dict()
                entity_id = kwargs.get('entityID', None)
                if entity_id is None:
                    raise HTTPError(400, _("400 Bad Request - missing entityID"))
                pdict['sp'] = self.md.sha1_id(entity_id)
                e = self.md.store.lookup(entity_id)
                if e is None or len(e) == 0:
                    raise HTTPError(404)

                if len(e) > 1:
                    raise HTTPError(400, _("400 Bad Request - multiple matches for") + " %s" % entity_id)

                pdict['entity'] = self.md.simple_summary(e[0])
                if not path:
                    pdict['search'] = "/search/"
                    pdict['list'] = "/role/idp.json"
                else:
                    pdict['search'] = "%s.s" % path
                    pdict['list'] = "%s.json" % path
                cherrypy.response.headers['Content-Type'] = 'text/html'
                return render_template("ds.html", **pdict)
            elif ext == 's':
                paged = bool(kwargs.get('paged', False))
                query = kwargs.get('query', None)
                page = kwargs.get('page', 0)
                page_limit = kwargs.get('page_limit', 10)
                entity_filter = kwargs.get('entity_filter', None)
                related = kwargs.get('related', None)

                cherrypy.response.headers['Content-Type'] = 'application/json'

                if query is None:
                    log.debug("empty query - creating one")
                    query = [cherrypy.request.remote.ip]
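                    # no explicit query: seed it with the client IP and add the
                    # non-suffix labels of the referrer host (if any) below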
                    referrer = cherrypy.request.headers.get('referrer', None)
                    if referrer is not None:
                        log.debug("including referrer: %s" % referrer)
                        url = urlparse.urlparse(referrer)
                        host = url.netloc
                        if ':' in url.netloc:
                            (host, port) = url.netloc.split(':')
                        suffix = self.psl.get_public_suffix(host)
                        prefix = host[:-len(suffix)] if host.endswith(suffix) else host
                        for host_part in prefix.rstrip('.').split('.'):
                            if host_part is not None and len(host_part) > 0:
                                query.append(host_part)
                    log.debug("created query: %s" % ",".join(query))

                if paged:
                    res, more, total = self.md.search(query,
                                                      path=q,
                                                      page=int(page),
                                                      page_limit=int(page_limit),
                                                      entity_filter=entity_filter,
                                                      related=related)
                    # log.debug(dumps({'entities': res, 'more': more, 'total': total}))
                    return dumps({'entities': res, 'more': more, 'total': total})
                else:
                    return dumps(self.md.search(query,
                                                path=q,
                                                entity_filter=entity_filter,
                                                related=related))
            elif accept.get('text/html'):
                if not q:
                    if pfx:
                        title = pfx
                    else:
                        title = _("Metadata By Attributes")
                    return render_template("index.html",
                                           md=self.md,
                                           alias=alias,
                                           aliases=self.aliases,
                                           title=title)
                else:
                    entities = self.md.lookup(q)
                    if not entities:
                        raise NotFound()
                    if len(entities) > 1:
                        return render_template("metadata.html",
                                               md=self.md,
                                               subheading=q,
                                               entities=entities)
                    else:
                        entity = entities[0]
                        t = html.fragment_fromstring(unicode(xslt_transform(entity, "entity2html.xsl")))
                        for c_elt in t.findall(".//code[@role='entity']"):
                            c_txt = dumptree(entity)
                            parser = etree.XMLParser(remove_blank_text=True)
                            src = StringIO(c_txt)
                            tree = etree.parse(src, parser)
                            c_txt = dumptree(tree, pretty_print=True, xml_declaration=False).decode("utf-8")
                            p = c_elt.getparent()
                            p.remove(c_elt)
                            if p.text is not None:
                                p.text += c_txt
                            else:
                                p.text = c_txt
                        xml = dumptree(t, xml_declaration=False).decode('utf-8')
                        return render_template("entity.html",
                                               headline=self.md.display(entity).strip(),
                                               subheading=entity.get('entityID'),
                                               entity_id=entity.get('entityID'),
                                               content=xml)
            else:
                for p in self.plumbings:
                    state = {'request': True,
                             'headers': {'Content-Type': 'text/xml'},
                             'accept': accept,
                             'url': cherrypy.url(relative=False),
                             'select': q,
                             'path': path,
                             'stats': {}}
                    r = p.process(self.md, state=state)
                    if r is not None:
                        cache_ttl = state.get('cache', 0)
                        log.debug("caching for %d seconds" % cache_ttl)
                        for k, v in state.get('headers', {}).iteritems():
                            cherrypy.response.headers[k] = v
                        caching.expires(secs=cache_ttl)
                        return r
        raise NotFound()
Example #52
def countrysufflist():
    # build parallel lists of country-code TLDs and country names from a
    # tab-separated "code<TAB>name" file
    code = []
    country = []
    with open("countrycodenamelist.txt", "r") as ccfile:
        for line in ccfile:
            byct = line.split("\t")
            code.append(byct[0])
            country.append(byct[1].rstrip()[2:])

    return [code, country]
            
        
#get parallel lists of country-code top-level domains and country names
cclist=countrysufflist()

#initialize lists
psl=PublicSuffixList()
tldns=[] #domain full
doms=[] #domain base
suffs=[] # suffix
ct_codes=[] #last piece of TLD, potentially country code
ct_names=[] #name if valid
urlcount=[] #count of urls
majorDoms=[] #.com, .edu, .gov, .org (even with additional suffix), else NULL
error=[]
with open("dom_count_full.txt","rb") as funfile:    
    for line in funfile:
        line_list=line.decode('utf-8').split('\t')
        
        tldn=psl.get_public_suffix(line_list[0].rstrip())
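        # get_public_suffix() returns the registrable domain (public suffix plus
        # one label), e.g. "example.co.uk" for "www.example.co.uk"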
        count=int(line_list[1])
        #split domain
Example #53
class FeedFilter:
    """
        Feedfilter takes in the arguments from the command line,
        processes them, and passes them out to an appropriate filter.
    """

    def __init__(self, **kwargs):
        """ args - passed in by optparse """
        self.cc_filters = []
        self.asn_filters = []
        self.format = None
        self.has_header = False
        self.infile = None
        self.outfile = None
        self.verbose = False
        self.quiet = False

        self.matchers = {}
        self.repo = NetObjectRepo()

        # regexs are intentionally broad - we'll run more tests later.

        self.matchers["ip"] = {"rex": "(?:\d+\.){3}\d+", "chk_func": self._is_valid_ip, "type": "ip"}
        self.matchers["hostname"] = {
            "rex": "([a-zA-Z0-9\-\.]+\.[0-9a-zA-Z\-\.]+)(?:\d+)?",
            "chk_func": self._is_valid_domain,
            "type": "domain",
        }

        logging.info("Getting Public Suffix List")
        self.psl = PublicSuffixList(self._get_psl_file())
        logging.info("Got Public Suffix List")

        self.parse_args(**kwargs)

    def _get_psl_file(self):
        """ returns Public Suffix List as a list of lines in the PSL """
        url = "http://mxr.mozilla.org/mozilla-central/source/netwerk/dns/effective_tld_names.dat?raw=1"
        headers = {"cache-control": "max-age=%d" % (60 * 60 * 24 * 7)}
        http = httplib2.Http(tempfile.gettempdir())
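        # httplib2 caches responses under the temp directory; the Cache-Control
        # request header above is presumably meant to let a cached copy up to a
        # week old be reused instead of re-downloading the list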
        response, content = http.request(url, headers=headers)

        return content.split("\n")

    def parse_args(
        self,
        infile=sys.stdin,
        outfile=sys.stdout,
        verbose=False,
        verboser=False,
        quiet=False,
        has_header=False,
        format=None,
        filter=None,
    ):
        def create_stdin_temp_file():
            f = tempfile.NamedTemporaryFile()
            for line in sys.stdin.read():
                f.write(line)
            # TODO: according to docs, a second open won't work on Win
            return open(f.name, "r")

        self.outfile = outfile
        self.verbose = verbose
        self.quiet = quiet
        self.has_header = has_header

        level = logging.WARN

        # quiet overrides everything else
        if verbose:
            level = logging.INFO
        if verboser:
            level = logging.DEBUG
        if quiet:
            level = logging.ERROR

        logging.basicConfig(level=level, format="%(message)s")

        if not infile or infile.name == "<stdin>":
            self.infile = create_stdin_temp_file()
        else:
            self.infile = infile

        for filt in filter.split(","):
            for m in re.findall("^(?:AS)?(\d+)$", filt):
                self.asn_filters.append(m.upper())
            for m in re.findall("^[A-Za-z]+$", filt):
                self.cc_filters.append(m.upper())

        if len(self.asn_filters) == 0 and len(self.cc_filters) == 0:
            # raise ValueError, "You need to specify at least one valid TLD or ASN filter. e.g. AS254,JP,AU"
            sys.exit("You need to specify --filter with at least one valid TLD or ASN filter. e.g. AS254,JP,AU")

        logging.info("Using filters: ")
        if self.asn_filters:
            logging.info("  ASN: %s" % (", ".join(self.asn_filters)))
        if self.cc_filters:
            logging.info("  Country codes: %s" % (", ".join(self.cc_filters)))

    def domains_to_ips(self):
        ar = AsyncResolver([domain_data["domain"] for domain_data in self.repo.get_domain_data()])
        resolved = ar.resolve()

        for host, ips in resolved.items():
            if ips is None:
                logging.debug("%s could not be resolved." % host)
            else:
                self.repo.add_domain_ips(host, ips)

    def extract_matches(self):
        self.infile.seek(0)
        for linenum, line in enumerate(self.infile.readlines()):
            # no need to parse a header line
            if self.has_header and linenum == 0:
                continue
            for (match_type, match) in self.get_line_matches(line, linenum):
                # self.repo.add(match_type, match)
                yield (match_type, match)

    def extract_and_store_matches(self):
        for match_type, match in self.extract_matches():
            self.repo.add(match_type, match)

    def get_filtered_lines(self):
        self.infile.seek(0)

        for linenum, line in enumerate(self.infile.readlines()):
            if self.has_header and linenum == 0:
                yield (line)
            else:
                for match_type, match in self.get_line_matches(line, linenum):
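                    # emit the line as soon as any extracted IP/domain satisfies
                    # the ASN / country-code filters; one hit is enough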
                    if self.repo.belongs_to(
                        datatype=match_type, data=match, asn_filters=self.asn_filters, cc_filters=self.cc_filters
                    ):
                        yield (line)
                        logging.debug("'%s' matches filter %s", match, match_type)
                        break

    def output_matches(self):
        for line in self.get_filtered_lines():
            self.outfile.write(line)

    def get_line_matches(self, line, line_num, fetch_only_one=False):
        try:
            match = False
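            # each matcher supplies a regex ("rex"), a validator ("chk_func"),
            # or both; yield a (type, value) pair for every candidate that passes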
            for m_key, m_dict in self.matchers.items():
                if "chk_func" in m_dict and "rex" in m_dict:
                    for m in re.findall(m_dict["rex"], line):
                        if m_dict["chk_func"](m):
                            match = True
                            logging.debug("matched '%s' as %s" % (m, m_key))
                            yield ((m_dict["type"], m))
                        if match and fetch_only_one:
                            break
                elif "chk_func" in m_dict and m_dict["chk_func"](line):
                    match = True
                    yield ((m_dict["type"], line))
                elif "rex" in m_dict:
                    for m in re.findall(m_dict["rex"], line):
                        match = True
                        yield ((m_dict["type"], m))
                if match and fetch_only_one:
                    break
        except csv.Error:
            logging.warn("Error parsing line %d, skipping" % line_num)

    def _is_valid_domain(self, domain):
        if not str(domain):
            return None
        # don't want / need to resolve IPs
        elif self._is_valid_ip(domain):
            return None
        else:
            # using this PSL, known TLDs return at least one .
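            # e.g. get_public_suffix("www.example.com") -> "example.com" (keeps a dot),
            # while a bare label such as "localhost" comes back with no dot at all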
            return self.get_tld(domain).find(".") >= 0

    def get_tld(self, domain):
        suffix = self.psl.get_public_suffix(domain)
        logging.debug("Domain fetched: %s", suffix)
        return suffix

    def _is_valid_ip(self, ip):
        for family in (socket.AF_INET, socket.AF_INET6):
            try:
                socket.inet_pton(family, ip)
            except Exception:
                pass
            else:
                return True
        return False

    def add_asn_cc_info(self):
        def asn_lookup():
            bw = BulkWhoisCymru()
            ip_list = []
            for ip_data in self.repo.get_ip_data():
                ip_list.append(str(ip_data["ip"]))
            return bw.lookup_ips(ip_list)

        asn_info = asn_lookup()

        for ip_data in self.repo.get_ip_data():
            if ip_data["ip"] in asn_info:
                ip = ip_data["ip"]
                self.repo.add_ip_asn_cc(ip, asn=asn_info[ip]["asn"], cc=asn_info[ip]["cc"])

    def add_domain_ccs(self):
        for domain_data in self.repo.get_domain_data():
            tld = self.get_tld(domain_data["domain"])
            if tld:
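                # the "cc" recorded here is the last label of the registrable
                # domain (e.g. "uk" for "example.co.uk"); generic TLDs such as
                # "com" are stored the same way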
                self.repo.add_domain_cc(domain_data["domain"], cc=(tld.split(".")[-1]))

    def process_file(self):
        stime = time.time()
        logging.info("Extracting matches")
        self.extract_and_store_matches()
        logging.debug("Got matches " + str(time.time() - stime))
        if self.repo.get_domain_count() > 0:
            logging.info("Resolving " + str(self.repo.get_domain_count()) + " unique domains")
            self.domains_to_ips()
            logging.debug("Resolved IPs " + str(time.time() - stime))
            logging.info("Looking up ASNs")
        if self.repo.get_ip_count() > 0:
            self.add_asn_cc_info()
            logging.debug("Got asns " + str(time.time() - stime))
            logging.info("Getting domain CCs")
        self.add_domain_ccs()
        logging.debug("Added domain ccs " + str(time.time() - stime))
        self.repo.dump()
Example #54
class Worker(object):
    def __init__(self, whois_sleep_seconds=10, nameservers=['8.8.8.8', '8.8.4.4'], log_filename=None):
        ''' initialize a worker object to do work
        '''
        self.psl = PublicSuffixList()
        self.whois_sleep_seconds = whois_sleep_seconds
        self.my_resolver = dns.resolver.Resolver()
        self.my_resolver.nameservers = nameservers

        with open(os.path.join(os.path.dirname(__file__), '../aux/whois_server_ips')) as f:
            self.whois_server_ips = json.load(f)

        self.logger = logging.getLogger('snapshooter_worker')
        self.logger.setLevel(logging.DEBUG)
        ch = logging.StreamHandler()
        self.logger.addHandler(ch)
        if log_filename:
            fh = logging.FileHandler(log_filename)
            self.logger.addHandler(fh)
           

    def _datetime_list_to_str(self, dt):
        ''' convert a list of datetimes to a list of strings
        '''
        return [str(each) for each in dt]


    def _datetime_to_str(self, dt):
        ''' convert a datetimes to a string
        '''
        return str(dt)
        

    def _log_this(self, domain, msg):
        ''' log a message
        '''
        self.logger.debug("%s\t%s\t%s\t%s" % (self.get_now(), "snapshooter_worker", domain, msg))


    def get_now(self):
        ''' get the time right now, a good candidate for testing RPC
        '''
        return str(datetime.datetime.now()).replace(' ', '_')


    def get_whois(self, domain, tld, retries=3, queried_servers=None, remaining_servers=None):
        ''' given a domain, find whois information in a semi-intelligent way
            using the TLD to server IP list in aux, rotate between whois server IPs authoritative for the TLD
            if all the IPs are throttling us, sleep and try again (sleeping decrements retries)
        '''
        # avoid mutable default arguments: module-level default sets would be
        # shared across calls and corrupt the server-rotation state
        if queried_servers is None:
            queried_servers = set()
        if remaining_servers is None:
            remaining_servers = set()
        self._log_this(domain, 'received whois call')
        # base case
        if retries < 1:
            self._log_this(domain, 'whois failed every time, bailing')
            return {}

        tld = '.' + tld.strip('.')

        # we know a set of IPs responsible for whois info for this tld, so we try to rotate between them 
        if tld in self.whois_server_ips:
            self._log_this(domain, 'tld found in whois_server_ips') 
            # this is the first iteration 
            if len(queried_servers) == 0 and len(remaining_servers) == 0:
                remaining_servers.update([ip for hostname in self.whois_server_ips[tld] for ip in self.whois_server_ips[tld][hostname]])
                self._log_this(domain, 'iterating over the following whois servers: %s' % (remaining_servers))

            # we've queried all the servers we can and now need to try sleeping
            if len(remaining_servers) == 0 and len(queried_servers) > 0:
                self._log_this(domain, 'querying whois with no specified server')
                try:
                    w = pythonwhois.get_whois(domain)
                except:
                    sys.stderr.write('domain: %s whois returned no results retries remaining: %d\n' % (domain, retries)) 
                    time.sleep(self.whois_sleep_seconds)
                    return self.get_whois(domain, tld, retries=retries-1)
            # remaining servers exist, let's try querying them before trying sleep
            else:
                server = random.sample(remaining_servers, 1)[0]
                queried_servers.add(server)
                remaining_servers.remove(server)
                self._log_this(domain, 'querying whois with specific server: %s' % (server))
                try:
                    w = pythonwhois.parse.parse_raw_whois(pythonwhois.net.get_whois_raw(domain, server=server))
                except:
                    sys.stderr.write('domain: %s whois returned no results from server: %s, retries remaining: %d\n' % (domain, server, retries)) 
                    # NO SLEEP
                    return self.get_whois(domain, tld, retries=retries, remaining_servers=remaining_servers, queried_servers=queried_servers)
        # the tld is not in our whois server list and we must use sleep to avoid being throttled
        else:
            self._log_this(domain, 'querying whois with no specified server')
            try:
                w = pythonwhois.get_whois(domain)
            except:
                sys.stderr.write('domain: %s whois returned no results retries remaining: %d\n' % (domain, retries)) 
                time.sleep(self.whois_sleep_seconds)
                return self.get_whois(domain, tld, retries=retries-1)
            
        # once we have a response...
        # messagepack (used by zerorpc) can't serialize datetime objects, so we make them strings :\
        for date in ('expiration_date', 'creation_date', 'updated_date', 'changedate'):
            if date in w:
                w[date] = self._datetime_list_to_str(w[date])
            for category in ('registrant', 'tech', 'billing', 'admin'):
                if ('contacts' in w) and (category in w['contacts']) and (w['contacts'][category] is not None) and (date in w['contacts'][category]):
                    w['contacts'][category][date] = self._datetime_to_str(w['contacts'][category][date])
        return w


    def get_asn(self, ip, domain):
        ''' given an IP address as a string, return Team Cymru ASN lookup results            
            radata example:    "33667 | 98.239.64.0/18 | US | arin | 2007-04-20"
        '''
        self._log_this(domain, 'received asn call %s' % (ip))
        return None
        #
        # This is disabled to avoid hammering TC's IP to ASN service. 
        # Enable at your own discretion.
        #
        #####################
        #try:
        #    ans = self.my_resolver.query(str(ip) + ".origin.asn.cymru.com", "TXT")
        #    for rdata in ans:
        #        result = [str(rdata).strip().strip('"') for rdata in str(rdata).split('|')]
        #        return result
        #except:
        #    return None
        #####################

    
    def get_ipv4s(self, domain):
        ''' given a domain, return a set of IPv4 information
            - ip2asn is used to reduce lookups to get_asn for the same IP
            - all the for loops and DNS queries might be confusing, so let's explain it:
                1. have 8.8.8.8 locate the NS records for the name we are interested in (find the authoritative name servers)
                2. locate the A records for those NS records (find the IPs the authoritative domains resolve to)
                3. send queries to those name server IPs for the A records of the original name (query the authoritative systems for the names' IP addresses)
            - this is similar to a 'dig +trace [name]' command, but lets Google recurse for the answer a bit and doesn't go directly to the roots
            - doing it this way gives us the authoritative TTL for the name
            - there is risk here of an attacker controlled name server providing incorrect or poisoned responses to workers
        '''
        self._log_this(domain, 'received ipv4s call')
        ip2asn = {}
        ipv4s = []
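        # reset to Google's public resolvers for the initial NS lookup; the
        # nameserver list is overwritten below when querying each authoritative IP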
        self.my_resolver.nameservers = ['8.8.8.8', '8.8.4.4']
        try:
            ns_names = self.my_resolver.query(domain, "NS")
            for ns_name in ns_names.rrset:
                self._log_this(domain, 'NS record: %s found' % (ns_name))
                ns_ips = self.my_resolver.query(str(ns_name), "A")
                for ns_ip in ns_ips.rrset:
                    self._log_this(domain, 'A record: %s, found for name %s' % (ns_ip, ns_name))
                    self.my_resolver.nameservers = [str(ns_ip)]
                    ans = self.my_resolver.query(domain, "A")
                    for rdata in ans.rrset:
                        ip_address = str(rdata)
                        self._log_this(domain, 'A record: %s found' % (ip_address))
                        ip_ttl = ans.rrset.ttl
                        try:
                            if ip_address in ip2asn:
                                self._log_this(domain, 'using cached results for IP: %s' % (ip_address))
                                asn, ip_prefix, ip_country, _, _ = ip2asn[ip_address]
                                ip_asn = int(asn)
                            else:
                                self._log_this(domain, 'attempting ASN lookup for IP: %s' % (ip_address))
                                # get_asn() takes the IP first, then the domain
                                asn, ip_prefix, ip_country, _, _ = self.get_asn(ip_address, domain)
                                ip2asn[ip_address] = asn, ip_prefix, ip_country, '', ''
                                ip_asn = int(asn)
                        except:
                            self._log_this(domain, 'failed lookup for IP: %s' % (ip_address))
                            ip_prefix = ''
                            ip_country = ''
                            ip_asn = 0
                            ip2asn[ip_address] = ip_asn, ip_prefix, ip_country, '', ''
                        ipv4s.append({'ip_address': str(ip_address), 
                                      'ip_ttl': ip_ttl, 'ns_name': str(ns_name), 
                                      'ns_ip': str(ns_ip), 'asn': ip_asn, 
                                      'ip_prefix': ip_prefix, 'ip_country': ip_country})
        except:
            pass
        return ipv4s


    def get_authoritative_domains(self, domain, nameservers):
        ''' given a domain, return information about the domain's authoritative domain
        '''
        self._log_this(domain, 'received auth call')
        auths = list()
        for server in nameservers:
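            # split the nameserver hostname into (subs, d, tld): get_public_suffix()
            # yields the registrable domain, whose first label is d and whose
            # remainder is treated as the tld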
            d = self.psl.get_public_suffix(server).split('.')[0].strip('.').lower()
            tld = self.psl.get_public_suffix(server).replace(d, '', 1).strip('.').lower()
            subs = server.replace('.'.join(['', d, tld]), '', 1).lower()
            self._log_this(domain, 'subs: %s, d: %s, tld: %s' % (subs, d, tld))
            w = self.get_whois("%s.%s" % (d, tld), tld)
            auths.append({'tld': tld, 'domain': d, 'subs': subs, 'whois': w})
        return auths


    def get_domain(self, domain):
        ''' given a domain, find:
            - IPv4s and TTLs
            - whois information
            - nameserver:
                - hostnames
                - IPv4s
                - whois information
        '''
        self._log_this(domain, 'received domain call')
        publicsuffix = self.psl.get_public_suffix(domain)
        dn_domain = publicsuffix.split('.')[0]
        dn_tld = '.'.join(publicsuffix.split('.')[1:])
        dn_subs = domain.replace('.'.join([dn_domain, dn_tld]), '', 1).split('.')
        w = self.get_whois(domain, dn_tld)
        dn_ips = self.get_ipv4s(domain)
        # try the nameserver found by DNS queries, then the nameservers from whois, then just skip it
        try: 
            nameservers = set([each['ns_name'] for each in dn_ips])
        except:
            try:
                nameservers = set(w['nameservers'])
            except:
                nameservers = set([])
        dn_authorities = self.get_authoritative_domains(domain, nameservers)
        self._log_this(domain, 'sending results')
        now = self.get_now()
        return {'ts': now, 'tld': dn_tld, 'domain': dn_domain, 
                'subs': dn_subs, 'ips': dn_ips, 'whois': w, 
                'request': domain, 'authorities': dn_authorities}
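
For reference, the registrable-domain split used in get_domain above can be exercised on its own. A minimal sketch, assuming the publicsuffix package that provides get_public_suffix() (the hostname below is purely hypothetical):

from publicsuffix import PublicSuffixList

psl = PublicSuffixList()
name = "dev.api.example.co.uk"                    # hypothetical input
registrable = psl.get_public_suffix(name)         # expected: "example.co.uk"
dn_domain = registrable.split('.')[0]             # "example"
dn_tld = '.'.join(registrable.split('.')[1:])     # "co.uk"
dn_subs = name.replace('.'.join([dn_domain, dn_tld]), '', 1).split('.')
# dn_subs == ['dev', 'api', ''] -- note the trailing empty label that replace() leaves behind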