def validate_non_public_suffix(value): """ check that value is not a public suffix """ psl = PublicSuffixList() if psl.get_public_suffix(value) != psl.get_public_suffix("x." + value): raise ValidationError(_("You cannot register public suffixes"))
def run(self):
    main_domain = _get_maindomain()
    all_domains = domain_list()["domains"]
    for domain in all_domains:
        self.logger_debug("Diagnosing DNS conf for %s" % domain)
        is_subdomain = domain.split(".", 1)[1] in all_domains
        for report in self.check_domain(domain, domain == main_domain, is_subdomain=is_subdomain):
            yield report

    # Check if a domain bought by the user will expire soon
    psl = PublicSuffixList()
    domains_from_registrar = [psl.get_public_suffix(domain) for domain in all_domains]
    domains_from_registrar = [domain for domain in domains_from_registrar if "." in domain]
    domains_from_registrar = set(domains_from_registrar) - set(YNH_DYNDNS_DOMAINS + ["netlib.re"])
    for report in self.check_expiration_date(domains_from_registrar):
        yield report
def __init__(self, pipes=None, autoreload=False, frequency=600, aliases=None,
             cache_enabled=True, observers=None, store=None):
    if aliases is None:
        aliases = ATTRS
    if not observers:
        observers = []
    if not pipes:
        pipes = []
    self._pipes = pipes
    self.cache_enabled = cache_enabled
    self.lock = ReadWriteLock()
    self.plumbings = [plumbing(v) for v in pipes]
    self.refresh = MDUpdate(cherrypy.engine, server=self, frequency=frequency)
    self.refresh.subscribe()
    self.aliases = aliases
    self.psl = PublicSuffixList()
    self.md = MDRepository(metadata_cache_enabled=self.cache_enabled, store=store)
    if autoreload:
        for f in pipes:
            cherrypy.engine.autoreload.files.add(f)
def receiver_policy(host):
    # type: (str) -> ReceiverPolicy
    '''Get the DMARC receiver policy for a host.

    :param str host: The host to look up.
    :returns: The DMARC receiver policy for the host.
    :rtype: A member of the :class:`gs.dmarc.ReceiverPolicy` enumeration.

    The :func:`receiver_policy` function looks up the DMARC receiver policy
    for ``host``. If the host does not have a published policy,
    `the organizational domain`_ is determined. The DMARC policy for the
    organizational domain is queried, and the subdomain policy is returned
    (if specified) or the overall policy for the domain is returned.
    Internally the :func:`gs.dmarc.lookup.lookup_receiver_policy` is used to
    perform the query.

    .. _the organizational domain:
       http://tools.ietf.org/html/draft-kucherawy-dmarc-base-04#section-3.2'''
    hostSansDmarc = host if host[:7] != '_dmarc.' else host[7:]

    retval = lookup_receiver_policy(hostSansDmarc)
    if retval == ReceiverPolicy.noDmarc:
        fn = get_suffix_list_file_name()
        with open(fn) as suffixList:
            psl = PublicSuffixList(suffixList)
            newHost = psl.get_public_suffix(hostSansDmarc)
            retval = lookup_receiver_policy(newHost, policyTag='sp')
    return retval
def receiver_policy(host):
    '''Get the DMARC receiver policy for a host.

    :param str host: The host to look up.
    :returns: The DMARC receiver policy for the host.
    :rtype: A member of the :class:`gs.dmarc.ReceiverPolicy` enumeration.

    The :func:`receiver_policy` function looks up the DMARC receiver policy
    for ``host``. If the host does not have a published policy,
    `the organizational domain`_ is determined and the DMARC policy for this
    is returned. Internally the :func:`gs.dmarc.lookup.lookup_receiver_policy`
    is used to perform the query.

    .. _the organizational domain:
       http://tools.ietf.org/html/draft-kucherawy-dmarc-base-04#section-3.2'''
    hostSansDmarc = host if host[:7] != '_dmarc.' else host[7:]

    retval = lookup_receiver_policy(hostSansDmarc)
    if retval == ReceiverPolicy.noDmarc:
        # TODO: automatically update the suffix list data file
        # <https://publicsuffix.org/list/effective_tld_names.dat>
        fn = get_suffix_list_file_name()
        with open(fn, 'r') as suffixList:
            psl = PublicSuffixList(suffixList)
            newHost = psl.get_public_suffix(hostSansDmarc)
            # TODO: Look up the subdomain policy
            retval = lookup_receiver_policy(newHost)
    return retval
def normdomain(domain):
    global _psl
    if _psl is None:
        _psl = PublicSuffixList(open(ssl_config.psl_filename, encoding='utf-8'))
    suf = _psl.get_public_suffix(domain)
    return domain if domain == suf else '*.%s' % suf
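# A rough usage sketch of the wildcard normalisation above (assumptions:
# ssl_config.psl_filename points at a local copy of the Public Suffix List and
# the module-level _psl cache starts out as None). Registrable domains come
# back unchanged; deeper names collapse to a wildcard on their registrable
# domain, i.e. the form a wildcard certificate would cover.
if __name__ == '__main__':
    print(normdomain('example.com'))        # expected: example.com
    print(normdomain('www.a.example.com'))  # expected: *.example.com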
def get_exact_domain(url):
    psl = PublicSuffixList()
    url = url.strip()
    u = urlparse(url)
    h = u.hostname
    if not h:
        h = url
    return psl.get_public_suffix(h)
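# A minimal usage sketch (assumptions: urlparse comes from urllib.parse and
# the `publicsuffix` package is installed). Full URLs are reduced to their
# registrable domain via the parsed hostname; bare host strings have no
# hostname after parsing and fall through the `if not h` branch instead.
print(get_exact_domain("https://www.example.co.uk/some/path"))  # expected: example.co.uk
print(get_exact_domain("sub.example.com"))                      # expected: example.com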
def getPublicSuffixDomain(host):
    if not host:
        return "other"
    psl = PublicSuffixList()
    domain = psl.get_public_suffix(host)
    if "." not in domain:
        domain = "other"
    return domain
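# Note on cost: PublicSuffixList() re-parses the suffix list data on every
# call above. When such a helper sits on a hot path, a module-level cache
# keeps the behaviour identical while parsing only once. A sketch, not part
# of the original snippet; the names _PSL and getPublicSuffixDomainCached are
# made up here.
_PSL = None

def getPublicSuffixDomainCached(host):
    global _PSL
    if not host:
        return "other"
    if _PSL is None:
        _PSL = PublicSuffixList()
    domain = _PSL.get_public_suffix(host)
    return domain if "." in domain else "other"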
def get_dmarc_record(hostname, use_cache=True,
                     cache_expiration=DEFAULT_EXPIRATION_TIME):
    """Return a DMARC record for a given hostname.

    This will query the DNS records for a hostname, returning a parsed
    version of the DMARC record, if found.

    If a record could not be found for the hostname, the organizational
    domain will be used instead (which is generally example.com for
    foo.bar.example.com, but this depends on the domain in question).

    By default, the fetched record from DNS is cached, allowing this to be
    called multiple times without repeated DNS queries. This is optional,
    as is the expiration time for the cached data (which defaults to
    1 month).

    Args:
        hostname (unicode):
            The hostname to look up the DMARC information from.

        use_cache (bool, optional):
            Whether to use the cache for looking up and storing record data.

        cache_expiration (int, optional):
            The expiration time for cached data.

    Returns:
        DmarcRecord:
        The DMARC record. If it could not be found, ``None`` will be
        returned instead.
    """
    record = _fetch_dmarc_record(hostname=hostname,
                                 use_cache=use_cache,
                                 cache_expiration=cache_expiration)

    if not record:
        # We need to fetch from the Organizational Domain for the hostname
        # provided. For this, we need to look up from a Public Suffix list.
        # The third-party module 'publicsuffix' will help us find that
        # domain, in combination with a data file we must ship.
        filename = 'mail/public_suffix_list.dat'

        try:
            stream = pkg_resources.resource_stream('djblets', filename)
            reader = codecs.getreader('utf-8')
            psl = PublicSuffixList(reader(stream))
        except IOError as e:
            logger.error('Unable to read public domain suffix list file '
                         '"%s" from Djblets package: %s',
                         filename, e)
        else:
            new_hostname = psl.get_public_suffix(hostname)

            if new_hostname != hostname:
                record = _fetch_dmarc_record(hostname=new_hostname,
                                             use_cache=use_cache,
                                             cache_expiration=cache_expiration)

    return record
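# The organizational-domain fallback described in the docstring boils down to
# a single publicsuffix call. A minimal illustration (assumption: a local
# public_suffix_list.dat is read from disk here instead of via
# pkg_resources):
import codecs
from publicsuffix import PublicSuffixList

with codecs.open('public_suffix_list.dat', encoding='utf-8') as f:
    psl = PublicSuffixList(f)
print(psl.get_public_suffix('foo.bar.example.com'))  # expected: example.com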
def normdomain(domain):
    from publicsuffix import PublicSuffixList
    global _psl
    if _psl is None:
        print('ssl: loading public suffix list')
        _psl = PublicSuffixList(open(ssl_config.psl_filename, encoding='utf-8'))
    suf = _psl.get_public_suffix(domain)
    return domain if domain == suf else '*.%s' % suf
def isHaveLongDigit(url):
    hostname = PublicSuffixList().get_public_suffix(url)
    m = re.match(r'\d*', hostname.split('.')[0])
    if m is not None:
        if len(m.group()) > 6:
            return 1
    return 0
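# Rough illustration of what the check above flags (assumption: `re` is
# imported at module level). get_public_suffix() reduces the input to its
# registrable domain, so the test looks at the first label of that domain and
# fires when it starts with more than six digits.
print(isHaveLongDigit('www.1234567890.com'))  # expected: 1
print(isHaveLongDigit('www.example.com'))     # expected: 0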
def get_exact_domain(args):
    ret = []
    psl = PublicSuffixList()
    for url in args:
        url = url.strip()
        u = urlparse(url)
        h = u.hostname
        if not h:
            h = url
        ret.append(psl.get_public_suffix(h))
    return ret
def normdomain(domain):
    from publicsuffix import PublicSuffixList
    global _psl
    if _psl is None:
        print('utils: loading public suffix list')
        _psl = PublicSuffixList(open(ssl_config.psl_filename, encoding='utf-8'))
    suf = _psl.get_public_suffix(domain)
    return domain if domain == suf else '*.%s' % suf
def __init__(self):
    self.tlds = self._load_tlds()
    self.alexa_top = self._load_alexa_whitelist()
    self.psl = PublicSuffixList(input_file=codecs.open(
        GENERATOR_CONFIG['tld_names_file'], "r", "utf8"))
    self.homoglyphs_confusables = self.loadconfusables()
    self.keyboards = [
        GENERATOR_CONFIG['qwerty'],
        GENERATOR_CONFIG['qwertz'],
        GENERATOR_CONFIG['azerty']
    ]
    self.homoglyphs = GENERATOR_CONFIG['homoglyphs']
    self.typo_domains = list()
def __init__(self, **kwargs): """ args - passed in by optparse """ self.cc_filters = [] self.asn_filters = [] self.format = None self.has_header = False self.infile = None self.outfile = None self.verbose = False self.quiet = False self.matchers = {} self.repo = NetObjectRepo() # regexs are intentionally broad - we'll run more tests later. self.matchers["ip"] = {"rex": "(?:\d+\.){3}\d+", "chk_func": self._is_valid_ip, "type": "ip"} self.matchers["hostname"] = { "rex": "([a-zA-Z0-9\-\.]+\.[0-9a-zA-Z\-\.]+)(?:\d+)?", "chk_func": self._is_valid_domain, "type": "domain", } logging.info("Getting Public Suffix List") self.psl = PublicSuffixList(self._get_psl_file()) logging.info("Got Public Suffix List") self.parse_args(**kwargs)
class Streamifier(BaseStreamifier):
    """
    Use the Public Suffix List <http://publicsuffix.org> to split the
    messages into streams, one per direction per suffix.
    """
    def __init__(self, procs):
        BaseStreamifier.__init__(self, procs)
        self.psl = PublicSuffixList()

    def streamify(self, messages):
        """
        Given a list of messages (each a req, res tuple), return a list of
        Stream objects.
        """
        reqs = defaultdict(list)
        ress = defaultdict(list)
        suffixes = []
        for req, res in messages:
            host = req[':host']
            suffix = self.psl.get_public_suffix(host.split(":", 1)[0])
            if suffix not in suffixes:
                suffixes.append(suffix)
            reqs[suffix].append((req, host))
            ress[suffix].append((res, host))

        streams = []
        for suffix in suffixes:
            streams.append(Stream(suffix, reqs[suffix], 'req', self.procs))
            streams.append(Stream(suffix, ress[suffix], 'res', self.procs))
        return streams
def domain_parser(df, field_list=[ "index", "url", "netloc", "domain", "site_name", "pub_suff", "country_code", "country", "major_dom" ]): #check if valid url, if so convert to unicode string, else mark as invalid, temporarily add to input DataFrame url = df.apply(link_clean, axis=1) url.name = 'url' urldf = pd.DataFrame(url) df['url'] = urldf #get netlocs nts = df.apply(netloc_grabber, axis=1) nts.name = 'netloc' #name column #get country code list and public suffix list cclist = countrysufflist() psl = PublicSuffixList() #get dom data from netlocs domdata = nts.apply(dom_details_grabber, args=(psl, cclist)) #get index as dataFrame and optional column original_ind = df.index original_ind.name = 'index' original_ind = pd.DataFrame(original_ind) #merge data (wiki id, ) mergedf = pd.concat([ original_ind, urldf, pd.DataFrame(nts), pd.DataFrame(domdata.tolist()) ], axis=1) return mergedf[field_list]
class HTTPAuthority(Authority): suffix_list = PublicSuffixList() def __init__(self, user_info_obj, host_str, port_int): Authority.__init__(self, user_info_obj, host_str, port_int) def getDomainUnencoded(self): host_unencoded_str = self.getHostUnencodedStr() is_ip_address = HTTPAuthority._isIPAddress(host_unencoded_str) result = None if is_ip_address == False: result = HTTPAuthority.suffix_list.get_public_suffix(host_unencoded_str) return result @staticmethod def _isIPAddress(host_unencoded_str): IPV4ADDRESS_RE = uri_parser.IPV4ADDRESS_FRAG_RE IP_LITERAL_RE = uri_parser.IP_LITERAL_FRAG_RE IP_ADDRESS_RE = r"^(?:(?:" + IPV4ADDRESS_RE + r")|(?:" + IP_LITERAL_RE + r"))$" regex = re.compile(IP_ADDRESS_RE) m = regex.match(host_unencoded_str) is_ip_address = m != None return is_ip_address def copy(self): host_unencoded_str = self.getHostUnencodedStr() port_int = self.getPortInt() next_user_info_obj = self._getUserInfoCopy() http_authority = HTTPAuthority(next_user_info_obj, host_unencoded_str, port_int) return http_authority
def load_suffix_list():
    if SUFFIX_CACHE and os.path.exists(SUFFIX_CACHE):
        logging.debug("Using cached suffix list.")
        cache_file = codecs.open(SUFFIX_CACHE, encoding='utf-8')
        suffixes = PublicSuffixList(cache_file)
    else:
        # File does not exist, download current list and cache it at given location.
        logging.debug("Downloading the Public Suffix List...")
        cache_file = fetch()
        content = cache_file.readlines()
        suffixes = PublicSuffixList(content)
        if SUFFIX_CACHE:
            logging.debug("Caching suffix list at %s" % SUFFIX_CACHE)
            utils.write(''.join(content), SUFFIX_CACHE)
    return suffixes
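# A small usage sketch of the cache-or-fetch helper above (assumptions:
# SUFFIX_CACHE, utils and the `publicsuffix` fetch() helper are defined in
# this module, as the function implies).
suffixes = load_suffix_list()
print(suffixes.get_public_suffix("scan.example.gov.uk"))  # expected: example.gov.uk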
def __init__(self, manager, args, mb_stream, states_context):
    self.logger = logging.getLogger("discovery")
    backend = manager.backend
    self.domain_cache = DomainCacheProxyWeb(backend.domain_metadata)
    try:
        psl_file = codecs.open("public_suffix_list.dat", encoding='utf8')
    except IOError:
        self.logger.exception("Please get the public suffix file from https://publicsuffix.org/")
        raise
    self._suffix_list = PublicSuffixList(psl_file)
    self._states_ctx = states_context
    self.states = backend.states
    self.user_agent = to_native_str(manager.settings.get('USER_AGENT'))
    self.max_pages = int(manager.settings.get('DISCOVERY_MAX_PAGES'))
    super(Discovery, self).__init__(manager, args, mb_stream, states_context)
def __init__(self, host, port=0, auth=None, use_ssl=False, starttls=False,
             prefix="noreply"):
    self._host = host
    self._port = port
    auth = auth or {}
    self._auth_user = auth.get('user')
    self._auth_password = auth.get('password')
    self._use_ssl = use_ssl
    self._starttls = starttls
    self.psl_file = publicsuffix.fetch()
    self.psl = PublicSuffixList(self.psl_file)
    self.prefix = prefix
def __init__(self, whois_sleep_seconds=10, nameservers=['8.8.8.8', '8.8.4.4'],
             log_filename=None):
    ''' initialize a worker object to do work '''
    self.psl = PublicSuffixList()
    self.whois_sleep_seconds = whois_sleep_seconds
    self.my_resolver = dns.resolver.Resolver()
    self.my_resolver.nameservers = nameservers
    with open(os.path.join(os.path.dirname(__file__), '../aux/whois_server_ips')) as f:
        self.whois_server_ips = json.load(f)
    self.logger = logging.getLogger('snapshooter_worker')
    self.logger.setLevel(logging.DEBUG)
    ch = logging.StreamHandler()
    self.logger.addHandler(ch)
    if log_filename:
        fh = logging.FileHandler(log_filename)
        self.logger.addHandler(fh)
def __init__(self, pipes=None, observers=None):
    if not observers:
        observers = []
    if not pipes:
        pipes = []
    self._pipes = pipes
    self.lock = ReadWriteLock()
    self.plumbings = [plumbing(v) for v in pipes]
    self.refresh = MDUpdate(cherrypy.engine, server=self, frequency=config.frequency)
    self.refresh.subscribe()
    self.aliases = config.aliases
    self.psl = PublicSuffixList()
    self.md = MDRepository(metadata_cache_enabled=config.caching_enabled, store=config.store)
    if config.autoreload:
        for f in pipes:
            cherrypy.engine.autoreload.files.add(f)
def main():
    ip_hostname_map = extract_host()  # Get ip::hostname map
    try:
        con = lite.connect('router-naming.db')  # Connecting to db
        cur = con.cursor()
        #cur.execute("DROP TABLE IF EXISTS NodeHost")  # Creating tables
        #cur.execute("DROP TABLE IF EXISTS NodeDegree")
        cur.execute("CREATE TABLE NodeHost(IP TEXT, PublicSuffix TEXT, HostName TEXT, NodeID TEXT, NID INT)")
        cur.execute("CREATE TABLE NodeDegree(IpDegree INT, HostNameDegree INT, NodeID TEXT, NID INT)")

        p = PublicSuffixList()  # Stripping public suffix domain
        node_file = open('data_set/midar-iff.nodes', 'r')  # Read in node file (midar-iff.nodes)
        node_file_data = node_file.readlines()
        for row in node_file_data:  # Go through all nodes
            if row[0] != "#":  # Ignore comments
                node_id = re.search(r'N\d*', row).group()  # Find node id in form of N<digits>
                node_id_int = node_id[1:]
                match_ip = re.split(r'node\sN\d*:\s*', row)  # Find list of ip addr of this node
                ip_list = match_ip[1].split()
                hit_count = 0  # Count ips found in ip::hostname map
                for ip in ip_list:  # Check all ips of this node
                    if ip in ip_hostname_map:
                        hit_count += 1
                        hostname = ip_hostname_map[ip]
                        public_sfx = p.get_public_suffix(hostname)
                        cur.execute(r"INSERT INTO NodeHost VALUES('%s', '%s', '%s', '%s', '%s');"
                                    % (ip, public_sfx, hostname, node_id, node_id_int))
                if hit_count > 0:  # If there is at least one hit
                    cur.execute(r"INSERT INTO NodeDegree VALUES(%d, %d, '%s', '%s');"
                                % (len(ip_list), hit_count, node_id, node_id_int))
        con.commit()
    except lite.Error, e:
        if con:
            con.rollback()
        print "Database error %s:" % e.args[0]
        sys.exit(1)
def get_psl(location=PSL_CACHE_LOC):
    """
    Grabs an updated public suffix list.
    """
    if not os.path.isfile(location):
        psl_file = fetch()
        with codecs.open(location, 'w', encoding='utf8') as f:
            f.write(psl_file.read())
    psl_cache = codecs.open(location, encoding='utf8')
    return PublicSuffixList(psl_cache)
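# A short usage sketch (assumptions: PSL_CACHE_LOC points at a writable path
# and fetch() is the downloader from the `publicsuffix` package). The first
# call downloads and caches the list; later calls reuse the cached file.
psl = get_psl()
print(psl.get_public_suffix("tracker.ads.example.org"))  # expected: example.org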
def get_psl(): """ Grabs an updated public suffix list. """ if not os.path.isfile(PSL_CACHE_LOC): print "%s does not exist, downloading a copy." % PSL_CACHE_LOC psl_file = fetch() with codecs.open(PSL_CACHE_LOC, 'w', encoding='utf8') as f: f.write(psl_file.read()) psl_cache = codecs.open(PSL_CACHE_LOC, encoding='utf8') return PublicSuffixList(psl_cache)
def main(): desc="Glenn's Firelamb: This tool will parse pcap files or listen on an interface for cookies. Cookies get saved to a Firefox cookies.sqlite file - one cookie file per observed device. ([email protected])" parser=OptionParser(description=desc) parser.add_option("-f", "--file", dest="fname",help="Specify pcap file to read") parser.add_option("-i", "--interface", dest="iface",help="Specify interface to listen on") parser.add_option("-p", "--ip_logging",action="store_true",dest="log_by_ip",default=False,help="Create cookie file per IP address. Default is per device MAC address") parser.add_option("-l", "--launch_firefox",dest="launch_ff",action = "store_true",default=False,help="Launch Firefox profiles for the saved cookies") parser.add_option("-s", "--karma_sslstrip",dest="sslstriplog",default=None,help="SSLStrip log file") parser.add_option("-t", "--karma_sslsplit",dest="sslsplitdir",default=None,help="Directory of SSLSplit log files") sqlv=sqlite3.sqlite_version.split('.') if (sqlv[0] <3 or sqlv[1] < 7): print "MANA (FireLamb) : [!] WARNING. sqlite3 version 3.7 or greater required. You have version %s.I'll try continue, but will likely not be able to write Firefox cookie files." %sqlite3.sqlite_version global psl global ip_logging psl = PublicSuffixList() (options, args) = parser.parse_args() ip_logging=options.log_by_ip if( not options.fname and not options.iface and not options.launch_ff): print parser.print_help() exit(-1) if(options.launch_ff): if (options.sslsplitdir): parsesslsplit(options.sslsplitdir) launch_firefox() else: if not os.path.exists(save_dir): os.makedirs(save_dir) print "MANA (FireLamb) : [+] Saving output to %s" %save_dir if(options.iface): print "MANA (FireLamb) : [+] Listening for cookie traffic on interface %s" %options.iface sniff(iface=options.iface,prn=process) elif(options.fname): print "MANA (FireLamb) : [+] Reading pcap file '%s'...." %options.fname packets=rdpcap(options.fname) print "MANA (FireLamb) : [+] Processing file contents..." for p in packets: process(p) print "MANA (FireLamb) : [+] Done."
def initPublicSuffixList(self):
    if self.psl is not None:
        return self.psl
    try:
        if fileExists(self.pslCachePath) and \
                getLastModifiedTimeSpent(self.pslCachePath, TIMESPENT_UNIT.DAYS) < self.timeSpentMax:
            pslFile = codecs.open(self.pslCachePath, encoding='utf8')
            self.psl = PublicSuffixList(pslFile)
            pslFile.close()
            return self.psl
        else:
            (dir, filename, ext, filenameExt) = decomposePath(self.pslCachePath)
            mkdirIfNotExists(dir)
            pslData = list(publicsuffix.fetch())
            removeIfExists(self.pslCachePath)
            strToFile(pslData, self.pslCachePath)
            self.psl = PublicSuffixList(pslData)
            return self.psl
    except Exception as e:
        logException(e, self, location="initPublicSuffixList")
        return None
def __init__(self, **kwargs):
    self.drone = kwargs.get('drone', "no_drone_name_supplied")
    self.verb = kwargs.get('verbose', 0)
    self.fname = os.path.splitext(os.path.basename(os.path.basename(__file__)))[0]
    self.psl = PublicSuffixList()
    self.cookies = fifoDict(names=("drone", "client_mac", "client_ip", "host",
                                   "name", "value", "baseDomain", "address",
                                   "lastAccessed", "creationTime"))
    self.userAgents = fifoDict(names=("mac", "userAgent"))
def load_suffix_list():
    # File does not exist, download current list and cache it at given location.
    utils.debug("Downloading the Public Suffix List...", divider=True)
    try:
        cache_file = fetch()
    except URLError as err:
        logging.warn("Unable to download the Public Suffix List...")
        utils.debug("{}".format(err))
        return []
    content = cache_file.readlines()
    suffixes = PublicSuffixList(content)
    return suffixes, content
def domain_split(server_domain):
    '''
    server_domain is the service (host) prefix plus the site's domain name.
    Split it into prefix (service name), host domain and suffix (top-level domain):
    input www.baidu.com    -> 'www', 'baidu', 'com'
    input 172.31.137.240   -> '', '172.31.137.240', ''
    '''
    PSL_FILE = codecs.open('public_suffix_list.dat', encoding='utf8')
    psl = PublicSuffixList(PSL_FILE)
    domain = psl.get_public_suffix(server_domain)
    # Take the first label of the registered domain (before the first '.') as the
    # host domain; what follows is the top-level domain, and what precedes it in
    # the original string is the service prefix.
    if '.' in domain:
        server = server_domain[:-len(domain)]
        host = domain[:domain.index('.')]
        top = domain[domain.index('.'):]
        hostname = server + host + top
    else:
        # Extraction failed (e.g. an IP address such as 172.31.137.240); treat the
        # whole input as the host domain.
        server = ''
        host = server_domain
        top = ''
        hostname = server_domain
    return server, host, top, hostname
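# A quick usage sketch (assumption: public_suffix_list.dat sits next to the
# script). Note that in practice the prefix and suffix keep their separating
# dots, since they are sliced directly out of the original string.
server, host, top, hostname = domain_split('www.baidu.com')
print(server, host, top, hostname)  # roughly: 'www.' 'baidu' '.com' 'www.baidu.com'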
def load_suffix_list():
    if SUFFIX_CACHE and os.path.exists(SUFFIX_CACHE):
        utils.debug("Using cached suffix list.", divider=True)
        cache_file = codecs.open(SUFFIX_CACHE, encoding='utf-8')
        suffixes = PublicSuffixList(cache_file)
    else:
        # File does not exist, download current list and cache it at given location.
        utils.debug("Downloading the Public Suffix List...", divider=True)
        try:
            cache_file = fetch()
        except URLError as err:
            logging.warn("Unable to download the Public Suffix List...")
            utils.debug("{}".format(err))
            return []
        content = cache_file.readlines()
        suffixes = PublicSuffixList(content)
        if SUFFIX_CACHE:
            utils.debug("Caching suffix list at %s" % SUFFIX_CACHE, divider=True)
            utils.write(''.join(content), SUFFIX_CACHE)
    return suffixes
class CommentsPortalUrls(NodesEdgesCreator): def __init__(self): self.psl = PublicSuffixList() def create(self, articles): articles = filter(lambda a: len(a['comments']) > 0, articles) domains = [] for article in articles: for comment in article['comments']: user = comment['author'] urls = self.__extract_urls_from_comment(comment) extracted_domains = self.__convert_urls_to_domains(urls) for domain in extracted_domains: ds = filter(lambda d: d.url == domain, domains) if len(ds) > 0: d = ds[0] else : d = Domain(domain) domains.append(d) d.add_user(user) nodes = self.__create_nodes(domains) return nodes, domains def __create_nodes(self, domains): nodes = [] for domain in domains: nodes.append(domain) for user in domain.edges: if user not in nodes: nodes.append(user) return nodes def __extract_urls_from_comment(self, comment): html = comment['htmlContent'] matches = re.findall(r'<a href="(.*?)" target="_blank" rel="nofollow">\1</a>', html) return matches def __extract_domain(self, url): url = url.lower() matches = re.search(r'(h|Ht|Tt|Tp|Ps|S)?:[/\\]{2}([wW]{3}\.)?([^:/\\]+)', url) match = matches.group(3) domain = self.psl.get_public_suffix(match) return domain def __convert_urls_to_domains(self, urls): return map(lambda u: self.__extract_domain(u), urls)
def run(fname=None, iface=None, log_by_ip=False, launch_ff=False, sslstriplog=None, sslsplitdir=None): global psl global ip_logging psl = PublicSuffixList() ip_logging = log_by_ip if (not fname and not iface and not launch_ff): print parser.print_help() exit(-1) if (launch_ff): if (sslsplitdir): parsesslsplit(sslsplitdir) launch_firefox() else: if not os.path.exists(save_dir): os.makedirs(save_dir) print "MANA (FireLamb) : [+] Saving output to %s" % save_dir if (iface): print "MANA (FireLamb) : [+] Listening for cookie traffic on interface %s" % iface sniff(iface=iface, prn=process) elif (fname): print "MANA (FireLamb) : [+] Reading pcap file '%s'...." % fname packets = rdpcap(fname) print "MANA (FireLamb) : [+] Processing file contents..." for p in packets: process(p) print "MANA (FireLamb) : [+] Done."
def alexa_malware_scan(url):
    domain = PublicSuffixList().get_public_suffix(
        urlparse(url).netloc)  # IRIs are going to be a pain here.
    pipe = redis_db["slave"].pipeline()
    pipe.hlen(domains_key)
    pipe.hmget(domains_key, domain)
    total, score = pipe.execute()
    score = score[0]

    def rank_to_ratio(score, total):
        """
        if the score is between 1 and 1 million never return 1
        If the score is none return 1
        """
        if score is not None:
            score = int(score) - 1
            total = total
            return score / total
        else:
            return 1

    return [{"type": "generic", "confidence": rank_to_ratio(score, total)}]
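# The confidence arithmetic above, worked through with made-up numbers
# (assume 1,000,000 ranked domains): a rank of 1 maps to (1 - 1) / 1000000 = 0.0,
# a mid-list rank of 500001 maps to 0.5, and an unranked domain (score None)
# maps to 1. Under Python 2 the division would be integer division, so the
# mid-list case would need float(total) to behave as shown.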
def __init__(self, pipes=None, autoreload=False, frequency=600, aliases=ATTRS,
             cache_enabled=True, hosts_dir=None, observers=[]):
    if pipes is None:
        pipes = []
    self.cache_enabled = cache_enabled
    self._md = None
    self.lock = ReadWriteLock()
    self.plumbings = [plumbing(v) for v in pipes]
    self.refresh = MDUpdate(cherrypy.engine, server=self, frequency=frequency)
    self.refresh.subscribe()
    self.aliases = aliases
    self.observers = observers
    self.psl = PublicSuffixList()
    if autoreload:
        for f in pipes:
            cherrypy.engine.autoreload.files.add(f)
class UrlChecker(object):
    psl = None  # public suffix list object
    netloc_cache = {}

    def __init__(self):
        self.psl = PublicSuffixList()

    def is_valid_url(self, url):
        isUrlOk = False
        if len(url):
            # parse url
            u = urlparse.urlparse(url)
            # add scheme if missing
            if u.scheme == u'':
                url = u'http://' + url
                u = urlparse.urlparse(url)
            # get netloc
            netloc = u.netloc
            if self.netloc_cache.has_key(netloc):
                isUrlOk = self.netloc_cache[netloc]
            else:
                query_domain = self.psl.get_public_suffix(netloc)
                # check if it is google
                parts = query_domain.split(u'.')
                if len(parts) > 0 and parts[0].upper() == u'GOOGLE':
                    isUrlOk = True
                self.netloc_cache[netloc] = isUrlOk
        return isUrlOk
def _parse_html(self,html): """parse html; return tuple (array html data, int res_stats, ); extracts result stats, title, url, content, time of content, related links""" bs = BeautifulSoup(html) res_docs = [] #get google estimation on number of results tot_res_el = bs.find('div',id = 'resultStats') tot_res = 0 if tot_res_el: non_decimal = re.compile(r'[^\d]+') try: tot_res = int(non_decimal.sub('',tot_res_el.text)) except Exception: pass #instance of Suffix list to extract top level domain from url psl = PublicSuffixList() #get li objects with main search results lis = bs.find_all('li','g') url_search = re.compile(r'\?q=.*sa=') type_search = re.compile(r'(\/url)|(\/images)|(\/search)') for li in lis: #check url type of title h3 = li.select('h3 a') container_type = '' if len(h3) > 0: href = h3[0].get('href') container_type = type_search.match(href) if container_type: container_type = container_type.group().lower() container_type = container_type[1:] #remove leading slash if container_type == u'url': doc = dict() #get title ttl_el = li.find(u'',u'r') ttl = ttl_el.text #get title url ttl_url = ttl_el.find('a').get('href') #clean url m = url_search.search(ttl_url) if m: ttl_url = m.group(0)[3:-4] #3 for ?q=, 4 for &sa= ttl_url = urllib.unquote(ttl_url) #ttl_url = urllib.unquote(ttl_url) #get tld tld = u'' try: u = urlparse.urlparse(ttl_url) tld = psl.get_public_suffix(u.netloc) except Exception: pass #get content cnt_el = None cnt_els = li.select('div.s span.st') cnt = u'' if len(cnt_els) > 0: cnt_el = cnt_els[0] if cnt_el: cnt = cnt_el.text #get time of content cnt_time = u'' if cnt_el: #get first <b> element and check its value b = cnt_el.find('b') if b and b.text == u'...': #get time cnt_time = cnt_el.strings.next() if len(cnt_time) > 25: cnt_time = u'' #get related links rel_links = [] rel_links_els = li.select('div.s div.osl a') for rel_link_el in rel_links_els: rel_link = rel_link_el.get('href') #clean url m = url_search.search(rel_link) if m: rel_link = m.group(0)[3:-4] rel_link = urllib.unquote(rel_link) #rel_link = urllib.unquote(rel_link) #add url to array rel_links.append(rel_link) #insert elements into doc doc['title'] = ttl doc['title_url'] = ttl_url doc['tld'] = tld doc['content'] = cnt doc['content_time'] = cnt_time doc['related_links'] = rel_links doc['type'] = container_type #add to docs array res_docs.append(doc) else: #here go images, vides, news and other pass #return tuple return (res_docs, tot_res, )
def scrape(self, url, return_data): current_task = self.current_task logger = self.logger url = url.encode('utf-8') #change status current_task.update_state(state=u'STARTED', meta={'url': url, 'group': self.group_name}) logger.info('TASK EXECUTING: %r, args: %r kwargs: %r' % ( self.current_task.request.id, self.current_task.request.args, self.current_task.request.kwargs)) #avoid doing anything if url is empty if len(url) == 0: self._wrong_param_exception(url) #parse url u = urlparse.urlparse(url) #add scheme if missing if u.scheme == '': url = 'http://' + url u = urlparse.urlparse(url) #get netloc netloc = u.netloc #get parsed query qs = urlparse.parse_qs(u.query) start = (qs['start'] if 'start' in qs else ['0'])[0] start = int(start) #convert qs elements from array to normal strings for k in qs.keys(): el = qs[k] el = ' '.join(el) #try to convert number strings into numbers new_k = k.lower() if new_k == 'num' or new_k == 'start': el = el.replace(' ', '') try: el = int(el) except Exception: pass qs[k] = el #add default values for num and start if there are none if not 'num' in qs: qs['num'] = 10 if not 'start' in qs: qs['start'] = 0 #get domain name of the query psl = PublicSuffixList() query_domain = psl.get_public_suffix(netloc) #check if it is google parts = query_domain.split(u'.') scraped_docs = '' if len(parts) > 0 and parts[0].upper() == u'GOOGLE': current_task.update_state(state=u'CONNECTING', meta={'url': url, 'group': self.group_name}) #create request req = { 'url': urllib.quote_plus(url, "%/:=&?~#+!$,;'@()*[]"), 'referer': u'http://google.com', 'useragent': u'Webscraper/' + CustomAppSettings.get_version() + ' (+http://www.yriver.it/webscraper/)',#'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.47 Safari/537.36', 'region': u'gb', 'priority': u'1' } #define which scrape key to take scrape_key = get_scrape_key_special if self.plan['is_special'] else get_scrape_key #make query query = {'key': scrape_key(), 'request': req} p = ProxyConnection(self.user, self.plan) html = p.send_request(query) #parse html scraped_docs = '' if len(html) > 0 and html != '0': scraped_docs, tot_res_est = self._parse_html(html) #write into db self._db_write_res(task_id=current_task.request.id, url=url, group_name=self.group_name, results=scraped_docs, tot_res=tot_res_est, start=start, query=qs, domain=query_domain) # Convert to Base64 if return_data = True if return_data: encoded_result = base64.standard_b64encode(json.dumps(scraped_docs)) return {'url': url, 'group_name': self.group_name, 'domain': query_domain, 'b64_json': encoded_result} else: return {'url': url, 'group_name': self.group_name}
def __init__(self):
    self.psl = PublicSuffixList()
class MDServer(object): """The MDServer class is the business logic of pyFF. This class is isolated from the request-decoding logic of MDRoot and from the ancilliary classes like MDStats and WellKnown. """ def __init__(self, pipes=None, observers=None): if not observers: observers = [] if not pipes: pipes = [] self._pipes = pipes self.lock = ReadWriteLock() self.plumbings = [plumbing(v) for v in pipes] self.refresh = MDUpdate(cherrypy.engine, server=self, frequency=config.update_frequency) self.refresh.subscribe() self.aliases = config.aliases self.psl = PublicSuffixList() self.md = MDRepository() self.ready = False if config.autoreload: for f in pipes: cherrypy.engine.autoreload.files.add(f) def reload_pipeline(self): new_plumbings = [plumbing(v) for v in self._pipes] self.plumbings = new_plumbings class MediaAccept(object): def __init__(self): pass def has_key(self, key): return True def get(self, item): return self.__getitem__(item) def __getitem__(self, item): try: return cptools.accept(item, debug=True) except HTTPError: return False def request(self, **kwargs): """The main request processor. This code implements all rendering of metadata. """ if not self.ready: raise HTTPError(503, _("Service Unavailable (repository loading)")) pfx = kwargs.get('pfx', None) path = kwargs.get('path', None) content_type = kwargs.get('content_type', None) request_type = kwargs.get('request_type', "negotiate") log.debug("MDServer pfx=%s, path=%s, content_type=%s" % (pfx, path, content_type)) def _d(x, do_split=True): dot = six.u('.') if x is not None: x = x.strip() # log.debug("_d(%s,%s)" % (x, do_split)) if x is None or len(x) == 0: return None, None if x.startswith("{base64}"): x = safe_b64d(x[8:]) if isinstance(x, six.binary_type): x = x.decode() if do_split and dot in x: (pth, _, extn) = x.rpartition(dot) if extn in _ctypes: return pth, extn return x, None _ctypes = {'xml': 'application/xml', 'json': 'application/json', 'htm': 'text/html', 'html': 'text/html', 'ds': 'text/html', 's': 'application/json'} alias = None if pfx: alias = pfx pfx = self.aliases.get(alias, None) if pfx is None: raise NotFound() path, ext = _d(path, content_type is None) if pfx and path: q = "{%s}%s" % (pfx, path) path = "/%s/%s" % (alias, path) else: q = path if ext is not None: log.debug("request path: %s.%s, headers: %s" % (path, ext, cherrypy.request.headers)) else: log.debug("request path: %s, headers: %s" % (path, cherrypy.request.headers)) accept = {} if content_type is None: if ext is not None and ext in _ctypes: accept = {_ctypes[ext]: True} else: accept = MDServer.MediaAccept() if ext is not None: path = "%s.%s" % (path, ext) else: accept = {content_type: True} with self.lock.readlock: if ext == 'ds': pdict = dict() entity_id = kwargs.get('entityID', None) if entity_id is None: raise HTTPError(400, _("400 Bad Request - missing entityID")) e = self.md.store.lookup(entity_id) if e is None or len(e) == 0: raise HTTPError(404) if len(e) > 1: raise HTTPError(400, _("Bad Request - multiple matches for") + " %s" % entity_id) pdict['entity'] = entity_simple_summary(e[0]) if not path: pdict['search'] = "/search/" pdict['list'] = "/role/idp.json" else: pdict['search'] = "{}.s".format(escape(path, quote=True)) pdict['list'] = "{}.json".format(escape(path, quote=True)) pdict['storage'] = "/storage/" cherrypy.response.headers['Content-Type'] = 'text/html' return render_template(config.ds_template, **pdict) elif ext == 's': paged = bool(kwargs.get('paged', False)) query = kwargs.get('query', None) page = kwargs.get('page', 0) 
page_limit = kwargs.get('page_limit', 10) entity_filter = kwargs.get('entity_filter', None) related = kwargs.get('related', None) cherrypy.response.headers['Content-Type'] = 'application/json' cherrypy.response.headers['Access-Control-Allow-Origin'] = '*' if query is None: log.debug("empty query - creating one") query = [cherrypy.request.remote.ip] referrer = cherrypy.request.headers.get('referrer', None) if referrer is not None: log.debug("including referrer: %s" % referrer) url = urlparse(referrer) host = url.netloc if ':' in url.netloc: (host, port) = url.netloc.split(':') for host_part in host.rstrip(self.psl.get_public_suffix(host)).split('.'): if host_part is not None and len(host_part) > 0: query.append(host_part) log.debug("created query: %s" % ",".join(query)) if paged: res, more, total = self.md.store.search(query, path=q, page=int(page), page_limit=int(page_limit), entity_filter=entity_filter, related=related) # log.debug(dumps({'entities': res, 'more': more, 'total': total})) return dumps({'entities': res, 'more': more, 'total': total}) else: return dumps(self.md.store.search(query, path=q, entity_filter=entity_filter, related=related)) elif accept.get('text/html'): if not q: if pfx: title = pfx else: title = _("Metadata By Attributes") return render_template("index.html", md=self.md, samlmd=samlmd, alias=alias, aliases=self.aliases, title=title) else: entities = self.md.lookup(q) if not entities: raise NotFound() if len(entities) > 1: return render_template("metadata.html", md=self.md, samlmd=samlmd, subheading=q, entities=entities) else: entity = entities[0] return render_template("entity.html", headline=entity_display_name(entity), subheading=entity.get('entityID'), entity_id=entity.get('entityID'), samlmd=samlmd, entity=entity_info(entity)) else: for p in self.plumbings: state = {'request': request_type, 'headers': {'Content-Type': 'text/xml'}, 'accept': accept, 'url': cherrypy.url(relative=False), 'select': q, 'path': path, 'stats': {}} r = p.process(self.md, state=state) if r is not None: cache_ttl = state.get('cache', 0) log.debug("caching for %d seconds" % cache_ttl) for k, v in list(state.get('headers', {}).items()): cherrypy.response.headers[k] = v cherrypy.response.headers['Access-Control-Allow-Origin'] = '*' caching.expires(secs=cache_ttl) return r raise NotFound()
#!/usr/bin/python
# encoding:utf-8
from publicsuffix import PublicSuffixList

domainParser = PublicSuffixList()
# print domainParser.get_public_suffix("www.example.com.cn")
# print domainParser.get_public_suffix("www.example.com.uk")
# print domainParser.get_public_suffix("jaysonhwang.sinaapp.com")
# print domainParser.get_public_suffix("1.jaysonhwang.sinaapp.com")
# print domainParser.get_public_suffix("jaysonhwang.sinaapp.com/web/1")
print domainParser.get_domain("http://192.168.0.100:8080/web")
print domainParser.get_domain("http://www.qq.com")

allow = [
    "http://www.people.com.cn",
    "http://www.xinhuanet.com",
    "http://www.qq.com",
    "http://www.163.com",
    "http://www.cntv.cn",
    "http://www.ifeng.com",
    "http://www.hexun.com",
    "http://www.sina.com.cn",
    "http://www.sohu.com",
    "http://www.dbw.cn",
]
for a in allow:
    print domainParser.get_domain(a)[0]
# -*- coding: utf-8 -*-
"""
Created on Fri Jun 27 11:57:34 2014

@author: Reed
"""
# script to find pathological URLs in URL list
##import re
from publicsuffix import PublicSuffixList
from urllib.parse import urlparse
import os
import re

os.chdir("C://Users//Reed//Documents//Github//wikiplus1")
psl = PublicSuffixList()

with open("dat_wiki_urls.txt", 'r', encoding='utf-8') as infile:
    weirdurls_reg = []
    weirdurls_arc = []
    for line in infile:
        ##if len(re.findall("^//|^http://|^https://",line)) ==0:
            ##weirdurls.append(line)
        # check if potential web archive
        if re.search("//", line):
            slashsplit = line.split("http://")
            if len(slashsplit) <= 2:
                # REMOVE PORT, ADD THIS TO SCRIPT PIPELINE
                dom = urlparse(line).netloc.split(':')[0]
                archive = 'NULL'
                tldn = psl.get_public_suffix(dom.rstrip())
                try:
class MDServer(object): """The MDServer class is the business logic of pyFF. This class is isolated from the request-decoding logic of MDRoot and from the ancilliary classes like MDStats and WellKnown. """ def __init__(self, pipes=None, observers=None): if not observers: observers = [] if not pipes: pipes = [] self._pipes = pipes self.lock = ReadWriteLock() self.plumbings = [plumbing(v) for v in pipes] self.refresh = MDUpdate(cherrypy.engine, server=self, frequency=config.frequency) self.refresh.subscribe() self.aliases = config.aliases self.psl = PublicSuffixList() self.md = MDRepository(metadata_cache_enabled=config.caching_enabled, store=config.store) if config.autoreload: for f in pipes: cherrypy.engine.autoreload.files.add(f) @property def ready(self): return self.md.store.ready() def reload_pipeline(self): new_plumbings = [plumbing(v) for v in self._pipes] self.plumbings = new_plumbings class MediaAccept(object): def __init__(self): pass def has_key(self, key): return True def get(self, item): return self.__getitem__(item) def __getitem__(self, item): try: return cptools.accept(item, debug=True) except HTTPError: return False def request(self, **kwargs): """The main request processor. This code implements all rendering of metadata. """ stats['MD Requests'] += 1 if not self.ready: raise HTTPError(503, _("Service Unavailable (repository loading)")) pfx = kwargs.get('pfx', None) path = kwargs.get('path', None) content_type = kwargs.get('content_type', None) log.debug("MDServer pfx=%s, path=%s, content_type=%s" % (pfx, path, content_type)) def _d(x, do_split=True): if x is not None: x = x.strip() log.debug("_d(%s,%s)" % (x, do_split)) if x is None or len(x) == 0: return None, None if x.startswith("{base64}"): x = x[8:].decode('base64') if do_split and '.' 
in x: (pth, dot, extn) = x.rpartition('.') assert (dot == '.') if extn in _ctypes: return pth, extn return x, None _ctypes = {'xml': 'application/xml', 'json': 'application/json', 'htm': 'text/html', 'html': 'text/html', 'ds': 'text/html', 's': 'application/json'} alias = None if pfx: alias = pfx pfx = self.aliases.get(alias, None) if pfx is None: raise NotFound() path, ext = _d(path, content_type is None) if pfx and path: q = "{%s}%s" % (pfx, path) path = "/%s/%s" % (alias, path) else: q = path if ext is not None: log.debug("request path: %s.%s, headers: %s" % (path, ext, cherrypy.request.headers)) else: log.debug("request path: %s, headers: %s" % (path, cherrypy.request.headers)) accept = {} if content_type is None: if ext is not None and ext in _ctypes: accept = {_ctypes[ext]: True} else: accept = MDServer.MediaAccept() if ext is not None: path = "%s.%s" % (path, ext) else: accept = {content_type: True} with self.lock.readlock: if ext == 'ds': pdict = dict() entity_id = kwargs.get('entityID', None) if entity_id is None: raise HTTPError(400, _("400 Bad Request - missing entityID")) pdict['sp'] = self.md.sha1_id(entity_id) e = self.md.store.lookup(entity_id) if e is None or len(e) == 0: raise HTTPError(404) if len(e) > 1: raise HTTPError(400, _("400 Bad Request - multiple matches for") + " %s" % entity_id) pdict['entity'] = self.md.simple_summary(e[0]) if not path: pdict['search'] = "/search/" pdict['list'] = "/role/idp.json" else: pdict['search'] = "%s.s" % path pdict['list'] = "%s.json" % path cherrypy.response.headers['Content-Type'] = 'text/html' return render_template("ds.html", **pdict) elif ext == 's': paged = bool(kwargs.get('paged', False)) query = kwargs.get('query', None) page = kwargs.get('page', 0) page_limit = kwargs.get('page_limit', 10) entity_filter = kwargs.get('entity_filter', None) related = kwargs.get('related', None) cherrypy.response.headers['Content-Type'] = 'application/json' if query is None: log.debug("empty query - creating one") query = [cherrypy.request.remote.ip] referrer = cherrypy.request.headers.get('referrer', None) if referrer is not None: log.debug("including referrer: %s" % referrer) url = urlparse.urlparse(referrer) host = url.netloc if ':' in url.netloc: (host, port) = url.netloc.split(':') for host_part in host.rstrip(self.psl.get_public_suffix(host)).split('.'): if host_part is not None and len(host_part) > 0: query.append(host_part) log.debug("created query: %s" % ",".join(query)) if paged: res, more, total = self.md.search(query, path=q, page=int(page), page_limit=int(page_limit), entity_filter=entity_filter, related=related) # log.debug(dumps({'entities': res, 'more': more, 'total': total})) return dumps({'entities': res, 'more': more, 'total': total}) else: return dumps(self.md.search(query, path=q, entity_filter=entity_filter, related=related)) elif accept.get('text/html'): if not q: if pfx: title = pfx else: title = _("Metadata By Attributes") return render_template("index.html", md=self.md, alias=alias, aliases=self.aliases, title=title) else: entities = self.md.lookup(q) if not entities: raise NotFound() if len(entities) > 1: return render_template("metadata.html", md=self.md, subheading=q, entities=entities) else: entity = entities[0] t = html.fragment_fromstring(unicode(xslt_transform(entity, "entity2html.xsl"))) for c_elt in t.findall(".//code[@role='entity']"): c_txt = dumptree(entity) parser = etree.XMLParser(remove_blank_text=True) src = StringIO(c_txt) tree = etree.parse(src, parser) c_txt = dumptree(tree, pretty_print=True, 
xml_declaration=False).decode("utf-8") p = c_elt.getparent() p.remove(c_elt) if p.text is not None: p.text += c_txt else: p.text = c_txt xml = dumptree(t, xml_declaration=False).decode('utf-8') return render_template("entity.html", headline=self.md.display(entity).strip(), subheading=entity.get('entityID'), entity_id=entity.get('entityID'), content=xml) else: for p in self.plumbings: state = {'request': True, 'headers': {'Content-Type': 'text/xml'}, 'accept': accept, 'url': cherrypy.url(relative=False), 'select': q, 'path': path, 'stats': {}} r = p.process(self.md, state=state) if r is not None: cache_ttl = state.get('cache', 0) log.debug("caching for %d seconds" % cache_ttl) for k, v in state.get('headers', {}).iteritems(): cherrypy.response.headers[k] = v caching.expires(secs=cache_ttl) return r raise NotFound()
def countrysufflist():
    code = []
    country = []
    with open("countrycodenamelist.txt", "r") as ccfile:
        for line in ccfile:
            byct = line.split("\t")
            code.append(byct[0])
            country.append(byct[1].rstrip()[2:])
    return [code, country]

# get parallel lists of country code top level domain and
cclist = countrysufflist()

# initialize lists
psl = PublicSuffixList()
tldns = []      # domain full
doms = []       # domain base
suffs = []      # suffix
ct_codes = []   # last piece of TLD, potentially country code
ct_names = []   # name if valid
urlcount = []   # count of urls
majorDoms = []  # .com, .edu, .gov, .org (even with additional suffix), else NULL
error = []

with open("dom_count_full.txt", "rb") as funfile:
    for line in funfile:
        line_list = line.decode('utf-8').split('\t')
        tldn = psl.get_public_suffix(line_list[0].rstrip())
        count = int(line_list[1])
        # split domain
class FeedFilter: """ Feedfilter takes in the arguments from the command line, processes them, and passes them out to an appropriate filter. """ def __init__(self, **kwargs): """ args - passed in by optparse """ self.cc_filters = [] self.asn_filters = [] self.format = None self.has_header = False self.infile = None self.outfile = None self.verbose = False self.quiet = False self.matchers = {} self.repo = NetObjectRepo() # regexs are intentionally broad - we'll run more tests later. self.matchers["ip"] = {"rex": "(?:\d+\.){3}\d+", "chk_func": self._is_valid_ip, "type": "ip"} self.matchers["hostname"] = { "rex": "([a-zA-Z0-9\-\.]+\.[0-9a-zA-Z\-\.]+)(?:\d+)?", "chk_func": self._is_valid_domain, "type": "domain", } logging.info("Getting Public Suffix List") self.psl = PublicSuffixList(self._get_psl_file()) logging.info("Got Public Suffix List") self.parse_args(**kwargs) def _get_psl_file(self): """ returns Public Suffix List as a list of lines in the PSL """ url = "http://mxr.mozilla.org/mozilla-central/source/netwerk/dns/effective_tld_names.dat?raw=1" headers = {"cache-control": "max-age=%d" % (60 * 60 * 24 * 7)} http = httplib2.Http(tempfile.gettempdir()) response, content = http.request(url, headers=headers) return content.split("\n") def parse_args( self, infile=sys.stdin, outfile=sys.stdout, verbose=False, verboser=False, quiet=False, has_header=False, format=None, filter=None, ): def create_stdin_temp_file(): f = tempfile.NamedTemporaryFile() for line in sys.stdin.read(): f.write(line) # TODO: according to docs, a second open won't work on Win return open(f.name, "r") self.outfile = outfile self.verbose = verbose self.quiet = quiet self.has_header = has_header level = logging.WARN # quiet overrides everything else if verbose: level = logging.INFO if verboser: level = logging.DEBUG if quiet: level = logging.ERROR logging.basicConfig(level=level, format="%(message)s") if not infile or infile.name == "<stdin>": self.infile = create_stdin_temp_file() else: self.infile = infile for filt in filter.split(","): for m in re.findall("^(?:AS)?(\d+)$", filt): self.asn_filters.append(m.upper()) for m in re.findall("^[A-Za-z]+$", filt): self.cc_filters.append(m.upper()) if len(self.asn_filters) == 0 and len(self.cc_filters) == 0: # raise ValueError, "You need to specify at least one valid TLD or ASN filter. e.g. AS254,JP,AU" sys.exit("You need to specify --filter with at least one valid TLD or ASN filter. e.g. AS254,JP,AU") logging.info("Using filters: ") if self.asn_filters: logging.info(" ASN: %s" % (", ".join(self.asn_filters))) if self.cc_filters: logging.info(" Country codes: %s" % (", ".join(self.cc_filters))) def domains_to_ips(self): ar = AsyncResolver([domain_data["domain"] for domain_data in self.repo.get_domain_data()]) resolved = ar.resolve() for host, ips in resolved.items(): if ips is None: logging.debug("%s could not be resolved." 
% host) else: self.repo.add_domain_ips(host, ips) def extract_matches(self): self.infile.seek(0) for linenum, line in enumerate(self.infile.readlines()): # no need to parse a header line if self.has_header and linenum == 0: pass for (match_type, match) in self.get_line_matches(line, linenum): # self.repo.add(match_type, match) yield (match_type, match) def extract_and_store_matches(self): for match_type, match in self.extract_matches(): self.repo.add(match_type, match) def get_filtered_lines(self): self.infile.seek(0) for linenum, line in enumerate(self.infile.readlines()): if self.has_header and linenum == 0: yield (line) else: for match_type, match in self.get_line_matches(line, linenum): if self.repo.belongs_to( datatype=match_type, data=match, asn_filters=self.asn_filters, cc_filters=self.cc_filters ): yield (line) logging.debug("'%s' matches filter %s", match, match_type) break def output_matches(self): for line in self.get_filtered_lines(): self.outfile.write(line) def get_line_matches(self, line, line_num, fetch_only_one=False): try: match = False for m_key, m_dict in self.matchers.items(): if "chk_func" in m_dict and "rex" in m_dict: for m in re.findall(m_dict["rex"], line): if m_dict["chk_func"](m): match = True logging.debug("matched '%s' as %s" % (m, m_key)) yield ((m_dict["type"], m)) if match and fetch_only_one: break elif "chk_func" in m_dict and m_dict["chk_func"](line): match = True yield ((m_dict["type"], line)) elif "rex" in m_dict: for m in re.findall(m_dict["rex"], line): match = True yield ((m_dict["type"], m)) if match and fetch_only_one: break except csv.Error: logging.warn("Error parsing line %d, skipping" % line_num) def _is_valid_domain(self, domain): if not str(domain): return None # don't want / need to resolve IPs elif self._is_valid_ip(domain): return None else: # using this PSL, known TLDs return at least one . return self.get_tld(domain).find(".") >= 0 def get_tld(self, domain): suffix = self.psl.get_public_suffix(domain) logging.debug("Domain fetched: %s", suffix) return suffix def _is_valid_ip(self, ip): for family in (socket.AF_INET, socket.AF_INET6): try: socket.inet_pton(family, ip) except Exception: pass else: return True return False def add_asn_cc_info(self): def asn_lookup(): bw = BulkWhoisCymru() ip_list = [] for ip_data in self.repo.get_ip_data(): ip_list.append(str(ip_data["ip"])) return bw.lookup_ips(ip_list) asn_info = asn_lookup() for ip_data in self.repo.get_ip_data(): if ip_data["ip"] in asn_info: ip = ip_data["ip"] self.repo.add_ip_asn_cc(ip, asn=asn_info[ip]["asn"], cc=asn_info[ip]["cc"]) def add_domain_ccs(self): for domain_data in self.repo.get_domain_data(): tld = self.get_tld(domain_data["domain"]) if tld: self.repo.add_domain_cc(domain_data["domain"], cc=(tld.split(".")[-1])) def process_file(self): stime = time.time() logging.info("Extracting matches") self.extract_and_store_matches() logging.debug("Got matches " + str(time.time() - stime)) if self.repo.get_domain_count() > 0: logging.info("Resolving " + str(self.repo.get_domain_count()) + " unique domains") self.domains_to_ips() logging.debug("Resolved IPs " + str(time.time() - stime)) logging.info("Looking up ASNs") if self.repo.get_ip_count() > 0: self.add_asn_cc_info() logging.debug("Got asns " + str(time.time() - stime)) logging.info("Getting domain CCs") self.add_domain_ccs() logging.debug("Added domain ccs " + str(time.time() - stime)) self.repo.dump()
class Worker(object): def __init__(self, whois_sleep_seconds=10, nameservers=['8.8.8.8', '8.8.4.4'], log_filename=None): ''' initialize a worker object to do work ''' self.psl = PublicSuffixList() self.whois_sleep_seconds = whois_sleep_seconds self.my_resolver = dns.resolver.Resolver() self.my_resolver.nameservers = nameservers with open(os.path.join(os.path.dirname(__file__), '../aux/whois_server_ips')) as f: self.whois_server_ips = json.load(f) self.logger = logging.getLogger('snapshooter_worker') self.logger.setLevel(logging.DEBUG) ch = logging.StreamHandler() self.logger.addHandler(ch) if log_filename: fh = logging.FileHandler(log_filename) self.logger.addHandler(fh) def _datetime_list_to_str(self, dt): ''' convert a list of datetimes to a list of strings ''' return [str(each) for each in dt] def _datetime_to_str(self, dt): ''' convert a datetimes to a string ''' return str(dt) def _log_this(self, domain, msg): ''' log a message ''' self.logger.debug("%s\t%s\t%s\t%s" % (self.get_now(), "snapshooter_worker", domain, msg)) def get_now(self): ''' get the time right now, a good candidate for testing RPC ''' return str(datetime.datetime.now()).replace(' ', '_') def get_whois(self, domain, tld, retries=3, queried_servers=set(), remaining_servers=set()): ''' given a domain, find whois information in a semi-intelligent way using the TLD to server IP list in aux, rotate between whois server IPs authoritative for the TLD if all the IPs are throttling us, sleep and try again (sleeping decrements retries) ''' self._log_this(domain, 'received whois call') # base case if retries < 1: self._log_this(domain, 'whois failed all time, bailing') return {} tld = '.' + tld.strip('.') # we know a set of IPs responsible for whois info for this tld, so we try to rotate between them if tld in self.whois_server_ips: self._log_this(domain, 'tld found in whois_server_ips') # this is the first iteration if len(queried_servers) == 0 and len(remaining_servers) == 0: remaining_servers.update([ip for hostname in self.whois_server_ips[tld] for ip in self.whois_server_ips[tld][hostname]]) self._log_this(domain, 'iterating over the following whois servers: %s' % (remaining_servers)) # we've queried all the servers we can and now need to try sleeping if len(remaining_servers) == 0 and len(queried_servers) > 0: self._log_this(domain, 'querying whois with no specified server') try: w = pythonwhois.get_whois(domain) except: sys.stderr.write('domain: %s whois returned no results retries remaining: %d\n' % (domain, retries)) time.sleep(self.whois_sleep_seconds) return self.get_whois(domain, tld, retries=retries-1) # remaining servers exist, let's try querying them before trying sleep else: server = random.sample(remaining_servers, 1)[0] queried_servers.add(server) remaining_servers.remove(server) self._log_this(domain, 'querying whois with specific server: %s' % (server)) try: w = pythonwhois.parse.parse_raw_whois(pythonwhois.net.get_whois_raw(domain, server=server)) except: sys.stderr.write('domain: %s whois returned no results from server: %s, retries remaining: %d\n' % (domain, server, retries)) # NO SLEEP return self.get_whois(domain, tld, retries=retries, remaining_servers=remaining_servers, queried_servers=queried_servers) # the tld is not in our whois server list and we must use sleep to avoid being throttled else: self._log_this(domain, 'querying whois with no specified server') try: w = pythonwhois.get_whois(domain) except: sys.stderr.write('domain: %s whois returned no results retries remaining: %d\n' % (domain, 
retries)) time.sleep(self.whois_sleep_seconds) return self.get_whois(domain, tld, retries=retries-1) # once we have a response... # messagepack (used by zerorpc) can't serialize datetime objects, so we make them strings :\ for date in ('expiration_date', 'creation_date', 'updated_date', 'changedate'): if date in w: w[date] = self._datetime_list_to_str(w[date]) for category in ('registrant', 'tech', 'billing', 'admin'): if ('contacts' in w) and (category in w['contacts']) and (w['contacts'][category] is not None) and (date in w['contacts'][category]): w['contacts'][category][date] = self._datetime_to_str(w['contacts'][category][date]) return w def get_asn(self, ip, domain): ''' given an IP address as a string, return Team Cymru ASN lookup results radata example: "33667 | 98.239.64.0/18 | US | arin | 2007-04-20" ''' self._log_this(domain, 'received asn call %s' % (ip)) return None # # This is disabled to avoid hammering TC's IP to ASN service. # Enable at your own discretion. # ##################### #try: # ans = self.my_resolver.query(str(ip) + ".origin.asn.cymru.com", "TXT") # for rdata in ans: # result = [str(rdata).strip().strip('"') for rdata in str(rdata).split('|')] # return result #except: # return None ##################### def get_ipv4s(self, domain): ''' given a domain, return a set of IPv4 information - ip2asn is used to reduce lookups to get_asn for the same IP - all the for loops and DNS queries might be confusing, so let's explain it: 1. have 8.8.8.8 locate the NS records for the name we are interested in (find the authoritative name servers) 2. locate the A records for those NS records (find the IPs the authoritative domains resolve to) 3. send queries to those name server IPs for the A records of the original name (query the authoritative systems for the names' IP addresses) - this is similar to a 'dig +trace [name]' command, but lets Google recurse for the answer a bit and doesn't go directly to the roots - doing it this way give us the authoritative TTL for the name - there is risk here of an attacker controlled name server providing incorrect or poisoned responses to workers ''' self._log_this(domain, 'received ipv4s call %s') ip2asn = {} ipv4s = [] self.my_resolver.nameservers = ['8.8.8.8', '8.8.4.4'] try: ns_names = self.my_resolver.query(domain, "NS") for ns_name in ns_names.rrset: self._log_this(domain, 'NS record: %s found' % (ns_name)) ns_ips = self.my_resolver.query(str(ns_name), "A") for ns_ip in ns_ips.rrset: self._log_this(domain, 'A record: %s, found for name %s' % (ns_ip, ns_name)) self.my_resolver.nameservers = [str(ns_ip)] ans = self.my_resolver.query(domain, "A") for rdata in ans.rrset: ip_address = str(rdata) self._log_this(domain, 'A record: %s found' % (ip_address)) ip_ttl = ans.rrset.ttl try: if ip_address in ip2asn.keys(): self._log_this(domain, 'using cached results for IP: %s' % (ip_address)) asn, ip_prefix, _, _, _ = ip2asn[ip_address] ip_asn = int(asn) else: self._log_this(domain, 'attempting ASN lookup for IP: %s' % (ip_address)) asn, ip_prefix, _, _, _ = self.get_asn(domain, ip_address) ip2asn[ip_address] = asn, ip_prefix, '', '', '' ip_asn = int(asn) except: self._log_this(domain, 'failed lookup for IP: %s' % (ip_address)) ip_prefix = '' ip_country = '' ip_asn = 0 ip2asn[ip_address] = ip_asn, ip_prefix, ip_country, '', '' ipv4s.append({'ip_address': str(ip_address), 'ip_ttl': ip_ttl, 'ns_name': str(ns_name), 'ns_ip': str(ns_ip), 'asn': ip_asn, 'ip_prefix': ip_prefix, 'ip_country': ip_country}) except: pass return ipv4s def 
get_authoritative_domains(self, domain, nameservers): ''' given a domain, return information about the domain's authoritative domain ''' self._log_this(domain, 'received auth call') auths = list() for server in nameservers: d = self.psl.get_public_suffix(server).split('.')[0].strip('.').lower() tld = self.psl.get_public_suffix(server).replace(d, '', 1).strip('.').lower() subs = server.replace('.'.join(['', d, tld]), '', 1).lower() self._log_this(domain, 'subs: %s, d: %s, tld: %s' % (subs, d, tld)) w = self.get_whois("%s.%s" % (d, tld), tld) auths.append({'tld': tld, 'domain': d, 'subs': subs, 'whois': w}) return auths def get_domain(self, domain): ''' given a domain, find: - IPv4s and TTLs - whois information - nameserver: - hostnames - IPv4s - whois information ''' self._log_this(domain, 'received domain call') publicsuffix = self.psl.get_public_suffix(domain) dn_domain = publicsuffix.split('.')[0] dn_tld = '.'.join(publicsuffix.split('.')[1:]) dn_subs = domain.replace('.'.join([dn_domain, dn_tld]), '', 1).split('.') w = self.get_whois(domain, dn_tld) dn_ips = self.get_ipv4s(domain) # try the nameserver found by DNS queries, then the nameservers from whois, then just skip it try: nameservers = set([each['ns_name'] for each in dn_ips]) except: try: nameserver = set(w['nameservers']) except: nameservers = set([]) dn_authorities = self.get_authoritative_domains(domain, nameservers) self._log_this(domain, 'sending results') now = self.get_now() return {'ts': now, 'tld': dn_tld, 'domain': dn_domain, 'subs': dn_subs, 'ips': dn_ips, 'whois': w, 'request': domain, 'authorities': dn_authorities}