def _images(self, task, page, match, images):
    debug(self._ID, 2, "%s: scanning images in %s", task.key, task.url)
    img = 0
    for imgtag in re.findall(self._rximg, page.replace("\n", " ")):
        while True:
            m = self._rxattr.match(imgtag)
            if not m:
                break
            arg = m.group(2)
            if len(arg) >= 2 and arg[0] == '"' and arg[-1] == '"':
                arg = arg[1:-1]
            elif len(arg) >= 2 and arg[0] == "'" and arg[-1] == "'":
                arg = arg[1:-1]
            if m.group(1) == "src" and re.search(match, arg):
                # Undo HTML entity escaping and encode spaces before
                # building the absolute URL.
                arg = arg.replace("&amp;", "&").replace(" ", "%20")
                url = urlparse.urljoin(task.url, arg)
                key = task.key[:-1] + (images[img],)
                debug(self._ID, 2, "%s: retrieving image #%d %s from %s (%s)",
                      key, img + 1, images[img], url, arg)
                self._reqman.put(Task(url, key, task.period, "image/*",
                                      self._pngdata, None))
                img += 1
            imgtag = imgtag[m.end():]
    if img != len(images):
        cherrypy.log("SCRAPER WARNING %s found %d of %d images"
                     % (task.url, img, len(images)))
    return "x"
def _asn_lookup_cc(self, info, task, tasks):
    """Perform final step of AS lookups, verifying country code for the
    autonomous system from cymru.com database using DNS lookups. This is
    more accurate than the code returned by initial AS lookups."""
    debug("IP2INFO", 2, "asn lookup/cc %s %s", info.ip, info.asn.asn)

    # Double check ASN lookup was really successful.
    if not info.asn.asn or not info.asn.asn.isdigit():
        return True

    # Define responder to country code lookup from cymru.com. Expects
    # 1-tuple answer matching RX_ASN_CC. Parse the one reply received.
    def responder(answer, addr):
        debug("IP2INFO", 3, "cc result %s from %s: %s",
              addr, info.asn.asn, answer)
        self._tick_stats(answer[0])
        task.ongoing = False
        if len(answer[3]) > 0 and len(answer[3][0]) == 1:
            m = RX_ASN_CC.match(answer[3][0][0])
            if m and m.group(1) == info.asn.asn:
                debug("IP2INFO", 2, "cc assigning %s = %s",
                      m.group(1), m.group(2))
                info.asn.cc = m.group(2)
                info.asn.rir = m.group(3)
                info.asn.date = m.group(4)
                info.asn.org = m.group(5)
                info.asn.desc = m.group(7)
                task.done = True

    debug("IP2INFO", 3, "submitting asn lookup %s", info.asn.asn)
    self._submit("as%s.asn.cymru.com" % info.asn.asn, rr.TXT, responder)
    return False
def _wild_lookup(self, info, task, tasks):
    """For addresses we have failed to reverse lookup, and failed to
    reverse lookup the CIDR base address, try other addresses in the same
    CIDR block. If the CIDR is narrower than /24, scan it entirely, and
    otherwise scan the nearest /24 segment. Remember whichever name we
    first come up with."""
    debug("IP2INFO", 2, "wild lookup %s %s", info.ip, info.wildhost)
    if info.domain or info.hostname or info.cidrhost or info.wildhost:
        return True

    # FIXME: Handle IPv6 here.
    cidrw = (info.cidr.prefixlen >= 24 and info.cidr.prefixlen) or 25
    addrs = [xip for xip in IPNetwork("%s/%d" % (info.ip, cidrw))]

    # Define responder to handle results for nearby address scan.
    # Remember only the first result we receive.
    def responder(hostname, ip):
        debug("IP2INFO", 2, "wild result %s -> %s -> %s %d",
              info.ip, ip, hostname, len(addrs))
        addrs.remove(ip)
        task.ongoing = (len(addrs) > 0)
        if hostname != None:
            task.done = True
            task.ongoing = False
            if not info.wildhost:
                info.wildhost = hostname
                debug("IP2INFO", 2, "wild hostname found %s: %s",
                      info.ip, info.wildhost)

    for xip in addrs[:]:
        debug("IP2INFO", 3, "wild hostname lookup %s", xip)
        self._submit_ptr(responder, xip, rr.PTRraw)
    return False
def scrape(self, section, url, images=None, match=None,
           period=900, urledit=None):
    """
    Register a HTML page to scrape for images matching a pattern.
    Images will be available for retrieval via `image()` using the names
    listed in `images`. If `images` and `match` are None, then scrapes
    images directly, otherwise scrapes images off an HTML page.

    :arg str section: identifier for images from this address
    :arg str url: html address where to retrieve page or image
    :arg callable urledit: dynamically modify url before request
    :arg list(str) images: list of image names
    :arg re match: regular expression to match image(s)
    :arg int period: interval in seconds between checks
    """
    debug(self._ID, 1, "%s: scrape %s, images %s match %s period %d",
          section, url, images, match, period)
    if match:
        ContentScraper.scrape(self, section, {"page": (url, urledit)},
                              period=period, content_type="text/html",
                              convert=lambda t, c, v:
                                  self._images(t, v, match, images))
    else:
        ContentScraper.scrape(self, section[:-1],
                              {section[-1]: (url, urledit)},
                              period=period, content_type="image/*",
                              convert=self._pngdata)
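A hedged usage sketch of the two calling modes described in the docstring; `scraper` stands for an instance of the class defining this method, and the URLs, section names and image names below are placeholders rather than values from this codebase.

    # Mode 1: scrape an HTML page and pull out the images whose src
    # matches the regular expression, exposing them as "cpu" and "memory".
    scraper.scrape("cluster", "https://example.invalid/status.html",
                   images=["cpu", "memory"],
                   match=r"graph\.png\?id=\d+",
                   period=300)

    # Mode 2: no match pattern, so fetch a single image directly; the
    # last section component becomes the image name.
    scraper.scrape(("cluster", "load"),
                   "https://example.invalid/load.png",
                   period=300)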
def reload(self):
    """Read the public suffix list. Reloads the database from the upstream
    database if there is no locally cached file, or the file is too old.
    Saves the cached data in YAML format, with punycoded names. If the
    cache file exists and is valid, it is loaded as is.

    Please note that py2-yaml used for reading and writing YAML should
    have been built with libyaml for this to perform optimally."""
    if not os.path.exists(self.path) \
            or time() - os.stat(self.path)[ST_MTIME] >= 15 * 86400:
        debug("PSL", 1, "downloading fresh psl database")
        self._parse(urllib2.urlopen(self.url))
        newpath = self.path + ".new"
        newfile = open(newpath, "w")
        newfile.write(yaml.dump(self.psl, Dumper=YAMLDumper))
        newfile.close()
        if os.path.exists(self.path):
            os.remove(self.path)
        os.rename(newpath, self.path)
    else:
        debug("PSL", 1, "reading psl database %s", self.path)
        self.psl = yaml.load(open(self.path).read(), YAMLLoader)
def scrape(self, section, urls, content_type="application/json",
           period=900, convert=None):
    """
    Register URLs for scraping content. Usually the content is JSON but
    it can be something else too, like HTML. All the URLs will be
    fetched, converted using `convert` and stored.

    :arg str section: label for this item
    :arg int period: interval in seconds between checks
    :arg str content_type: expected content type in response
    :arg callable convert: response conversion, e.g. cjson.decode
    :arg dict urls: (title, url) or (title, (url, urledit)) of data to retrieve
    """
    debug(self._ID, 1, "%s: scrape %s, period %d, content type %s",
          section, urls, period, content_type)
    with self._cv:
        if isinstance(section, basestring):
            section = (section,)
        map(lambda title: self._put(section + (title,), 0, None), urls.keys())
        self._scrape.append({"section": section,
                             "period": period,
                             "content_type": content_type,
                             "urls": urls,
                             "convert": convert})
        self._cv.notifyAll()
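A hedged sketch of registering JSON endpoints with this method; `scraper` and the URLs are placeholders, and the `cjson.decode` converter follows the docstring's suggestion. The converter signature (task, curl handle, body) matches how `_reqdone()` invokes it further below.

    scraper.scrape("jobs",
                   {"queued":  "https://example.invalid/api/queued.json",
                    "running": "https://example.invalid/api/running.json"},
                   content_type="application/json",
                   period=600,
                   convert=lambda task, c, body: cjson.decode(body))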
def _submit_ptr(self, callback, ip, type=rr.PTR):
    """Submit a DNS reverse lookup query, but avoid making duplicates.

    If there is already a query ongoing for the given reverse IP address,
    no new query is submitted but the address is cached in `self.ptrmap`
    so the callback knows to consider this new association. The purpose
    is to help the caller make multiple reverse IP address queries for a
    given "destination" address, and avoid any excess ones when making
    network address scans for multiple "origin" addresses.

    Addresses which fail reverse lookup with a permanent error such as
    NXDOMAIN are remembered. Future queries on those addresses are
    short-circuited and immediately invoke the `callback` without issuing
    a new DNS query.

    @param callback -- function to call back if the query is answered
    @param ip -- the reversed IP address to look up
    @param type -- request record type (adns.rr.*)."""
    # If known to fail, skip.
    if ip in self.ptrfail:
        callback(None, ip)
        return

    # Add to pending list of ptr lookups.
    if ip not in self.ptrmap:
        self.ptrmap[ip] = []
    self.ptrmap[ip].append(callback)

    # Create DNS query if this is the first pending lookup for this address.
    if len(self.ptrmap[ip]) == 1:
        debug("IP2INFO", 3, "submitting ptr lookup of %s", ip)
        self._submit(ip.reverse_dns, type, self._ptr_result, ip)
def domain(self, hostname):
    """Translate a host name to a domain name using the public suffix list.

    @param hostname -- string, the host name to look up
    @return string with the domain portion of `hostname`."""
    dir = self.psl
    domain = []
    insuffix = True
    parts = hostname.split(".")[::-1]
    first = True
    while len(parts):
        debug("PSL", 3, "hostname %s domain %s parts %s insuffix %s dir %s",
              hostname, domain, parts, insuffix,
              (first and "{(all)}") or dir)
        part = parts.pop(0)
        if insuffix:
            domain.append(part)
        first = False
        if part in dir:
            if isinstance(dir[part], dict):
                dir = dir[part]
            else:
                insuffix = not dir[part]
                dir = {}
        elif "*" in dir:
            insuffix = not dir["*"]
            dir = {}
        else:
            break
    domname = ".".join(domain[::-1])
    debug("PSL", 2, "hostname %s mapped to domain %s", hostname, domname)
    return domname
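A hedged illustration of the mapping this method performs. The cache path is hypothetical, it is not shown here whether the constructor loads the database itself (so the sketch calls `reload()` explicitly), and the expected outputs simply follow public suffix list semantics, i.e. the registrable domain is the public suffix plus one label.

    psl = PublicSuffixLookup(path="psl.yml")   # hypothetical cache location
    psl.reload()                               # load or refresh the cached database
    print psl.domain("www.example.co.uk")      # expected: example.co.uk
    print psl.domain("cmsweb.cern.ch")         # expected: cern.ch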
def process(self, waittime=1):
    """Process DNS queries and callbacks for up to `waittime` seconds."""
    # Wait for any queries to complete. Issue any new tasks created.
    now = time()
    npending = self._issue()
    num = len(self.queries)
    until = now + waittime
    prevtime = now
    while len(self.queries):
        ndone = 0
        for q in self.res.completed(.25):
            (created, callback, args) = self.queries[q]
            del self.queries[q]
            callback(q.check(), *args)
            ndone += 1
        if ndone > 0:
            npending = self._issue()

        # See if we should quit. Throttle back if 'completed()' returned
        # quickly and we are busy looping.
        xnow = time()
        if xnow > until:
            break
        if xnow - prevtime < 0.1 and (npending or len(self.queries)):
            sleep(min(0.5, until - xnow))
        prevtime = xnow

    # Expire whatever was running too long, and report timing.
    self._timeout()
    debug("IP2INFO", 2, "processed %d dns queries in %.2f s,"
          " %d remain, %d pending",
          num, time() - now, len(self.queries), npending)
    return npending
def _cname_lookup(self, info, task, tasks):
    """Issue canonical name lookup for a host name. For the occasional
    poorly configured sites with CNAME linked to another CNAME, issues a
    few levels of recursive requests to get to a final host name."""
    debug("IP2INFO", 2, "cname lookup %s %s", info.hostname, info.cnames)

    # Report ready if we already have a result.
    if info.cnames:
        return True

    # Do DNS CNAME lookup.
    def responder(answer, addr):
        debug("IP2INFO", 2, "cname result %s from %s: %s",
              addr, info.hostname, answer)
        self._tick_stats(answer[0])
        task.done = True
        task.ongoing = False
        if addr not in info.cnames:
            info.cnames[addr] = []
        for cname in answer[3]:
            info.all_names.update((cname,))
            cnames = info.cnames[addr]
            cnames.append(cname)
            if len(cnames) < 5:
                self._submit(cname, rr.CNAME, responder)
                task.ongoing = True
                task.done = False

    self._submit(info.hostname, rr.CNAME, responder)
    return False
def _addr_lookup(self, info, task, tasks):
    """Issue forward name lookup for a host name. Issues A requests for
    the original host name and all CNAMEs discovered. All successfully
    looked up addresses get their own reverse IP lookup process."""
    debug("IP2INFO", 2, "addr lookup %s %s %s",
          info.hostname, info.addrs, info.all_names)

    # Report ready if we already have a result.
    if info.addrs:
        return True

    # Do DNS forward lookup for hostname and all CNAMEs.
    def responder(answer, name):
        debug("IP2INFO", 2, "addr result %s from %s: %s",
              name, info.hostname, answer)
        self._tick_stats(answer[0])
        if name not in info.addrs:
            info.addrs[name] = []
        for ipstr in answer[3]:
            info.all_addrs.update((ipstr,))
            ip = IPAddress(ipstr)
            info.addrs[name].append(ip)
            self._insert_lookup(ip, info)
        task.done = (len(info.addrs) == len(info.all_names))
        task.ongoing = not task.done
        if task.done and answer[0] > adns.status.max_misconfig \
                and not info.all_addrs:
            tasks.append(IPTask(self._notify_hosts, 1))

    for name in info.all_names:
        self._submit(name, rr.A, responder)
    return False
def _reqinit(self, c, task):
    debug(self._ID, 2, "initialising request to %s (%s)",
          task.url, task.content_type)
    c.headers = []
    c.setopt(pycurl.URL, task.url)
    c.setopt(pycurl.HEADERFUNCTION, c.headers.append)
    c.setopt(pycurl.HTTPHEADER, ["Accept: %s" % task.content_type,
                                 "Accept-Encoding: gzip, deflate"])
def responder(hostname, ip): debug("IP2INFO", 3, "cidr result %s -> %s", info.ip, hostname) task.ongoing = False if hostname != None: task.done = True info.cidrhost = hostname debug("IP2INFO", 2, "cidr hostname found %s: %s", info.ip, info.cidrhost)
def _submit(self, addr, type, callback, *extra):
    """Submit a DNS query.

    @param addr -- the address to look up
    @param type -- request record type (adns.rr.*)
    @param callback -- function to call back if the query is answered
    @param extra -- additional arguments to `callback`."""
    debug("IP2INFO", 3, "submitting lookup of %s, type %d", addr, type)
    self.queries[self.res.submit(addr, type)] = \
        (time(), callback, (addr,) + extra)
def _asn_lookup_1(self, info, task, tasks):
    """Perform first step of ASN lookup, by checking reserved addresses."""
    debug("IP2INFO", 2, "asn lookup/reserved %s %s", info.ip, info.asn.asn)
    if info.asn == self.NULL_ASN:
        resv = self._reserved(info.ip)
        if resv:
            info.cidr = resv
            info.domain = str(resv)
            info.asn = self.asnmap["@%s" % info.domain]
    return True
def _geoip_lookup(self, info, task, tasks):
    """Perform GeoIP lookup for an IP address."""
    debug("IP2INFO", 2, "geoip lookup %s %s", info.ip, info.geoip)

    # Report ready if we already have a result.
    if info.geoip != GeoIPLookup.NULL_GEOIP:
        return True

    # Lookup GeoIP info.
    info.geoip = self.gip.lookup(info.ip)
    return True
def responder(hostname, ip): debug("IP2INFO", 2, "wild result %s -> %s -> %s %d", info.ip, ip, hostname, len(addrs)) addrs.remove(ip) task.ongoing = (len(addrs) > 0) if hostname != None: task.done = True task.ongoing = False if not info.wildhost: info.wildhost = hostname debug("IP2INFO", 2, "wild hostname found %s: %s", info.ip, info.wildhost)
def _ptr_result(self, answer, addr, ip):
    """Respond to PTR query results."""
    debug("IP2INFO", 3, "ptr result %s %s %s", addr, ip, answer)
    self._tick_stats(answer[0])
    if answer[0] > adns.status.max_tempfail:
        # Permanent failure, remember not to ask again.
        debug("IP2INFO", 3, "blacklisting %s %s (%d)",
              ip, _adns_status_name_of(answer[0]), answer[0])
        self.ptrfail.add(ip)
    hostname = (len(answer[3]) > 0 and answer[3][0].lower()) or None
    for callback in self.ptrmap[ip]:
        callback(hostname, ip)
    del self.ptrmap[ip]
def _reqerror(self, c, task, errmsg, errno):
    result = getattr(task, "result", None)
    cherrypy.log(("CACHE ERROR %s request failed with error:"
                  " %s (code %d), headers %s")
                 % (getattr(task, "url", c.getinfo(pycurl.EFFECTIVE_URL)),
                    errmsg, errno, c.headers))
    if result:
        with result["signal"]:
            debug(self._ID, 2, "signaling error on %s, pending %d",
                  task.url, result["pending"])
            if not result["error"]:
                result["error"] = RuntimeError("http error %s (code %d)"
                                               % (errmsg, errno))
            result["signal"].notifyAll()
def responder(answer, addr): debug("IP2INFO", 3, "cc result %s from %s: %s", addr, info.asn.asn, answer) self._tick_stats(answer[0]) task.ongoing = False if len(answer[3]) > 0 and len(answer[3][0]) == 1: m = RX_ASN_CC.match(answer[3][0][0]) if m and m.group(1) == info.asn.asn: debug("IP2INFO", 2, "cc assigning %s = %s", m.group(1), m.group(2)) info.asn.cc = m.group(2) info.asn.rir = m.group(3) info.asn.date = m.group(4) info.asn.org = m.group(5) info.asn.desc = m.group(7) task.done = True
def _reqerror(self, c, task, errmsg, errno): result = getattr(task, "result", None) cherrypy.log(("CACHE ERROR %s request failed with error:" " %s (code %d), headers %s") % (getattr(task, "url", c.getinfo(pycurl.EFFECTIVE_URL)), errmsg, errno, c.headers)) if result: with result["signal"]: debug(self._ID, 2, "signaling error on %s, pending %d", task.url, result["pending"]) if not result["error"]: result["error"] = RuntimeError("http error %s (code %d)" % (errmsg, errno)) result["signal"].notifyAll()
def responder(answer, name): debug("IP2INFO", 2, "addr result %s from %s: %s", name, info.hostname, answer) self._tick_stats(answer[0]) if name not in info.addrs: info.addrs[name] = [] for ipstr in answer[3]: info.all_addrs.update((ipstr,)) ip = IPAddress(ipstr) info.addrs[name].append(ip) self._insert_lookup(ip, info) task.done = (len(info.addrs) == len(info.all_names)) task.ongoing = not task.done if task.done and answer[0] > adns.status.max_misconfig and not info.all_addrs: tasks.append(IPTask(self._notify_hosts, 1))
def __init__(self, cachedir=None, gip=None, psl=None, res=None,
             maxtime=30, maxtries=3):
    """Constructor. Initialises the lookup object so it is ready for queries.

    @param cachedir -- Default location for caching databases, used if
    `gip` or `psl` have not been specified. If unset and neither `gip`
    nor `psl` arguments are provided, the databases are cached in the
    current directory.

    @param gip -- Reference to GeoIPLookup object. If None, a new object
    is created, using `cachedir` or current directory as the location for
    the city database file.

    @param psl -- Reference to PublicSuffixLookup object. If None, a new
    object is created, using `cachedir` or current directory as the
    location for the YAML cache database.

    @param res -- Reference to adns DNS resolver object. If None, a new
    resolver is created. If you want to use a nameserver other than your
    system default one, pass in a custom adns object created with the
    appropriate "nameserver x.y.z.w" resolver argument.

    @param maxtime -- The maximum time to wait for DNS replies. Some DNS
    servers are slow to respond so some queries take a long time to
    complete, or will simply time out. If the client is submitting large
    numbers of addresses for query, the stragglers are handled
    automatically and there is no reason to reduce the query time-out.
    However if the client has just a few addresses to resolve, or is in a
    hurry to get the answer, set `maxtime` to some smaller value.

    @param maxtries -- The maximum number of times to attempt main queries
    per IP address. In general this value should be greater than one to
    avoid failures resulting from dropped DNS packets and to catch
    straggler responses from slow, far away and somewhat misconfigured
    DNS servers. More than three rarely improves the accuracy of the
    results."""
    now = time()
    self.maxtime = maxtime
    self.maxtries = maxtries
    geopath = (cachedir and "%s/GeoLiteCity.dat" % cachedir)
    pslpath = (cachedir and "%s/psl.yml" % cachedir)
    self.res = res or adns.init(adns.iflags.noautosys)
    debug("IP2INFO", 2, "dns resolver initialised %.2f", time() - now)
    self.gip = gip or GeoIPLookup(path=geopath)
    debug("IP2INFO", 2, "geoip resolver initialised %.2f", time() - now)
    self.psl = psl or PublicSuffixLookup(path=pslpath)
    debug("IP2INFO", 2, "domain resolver initialised %.2f", time() - now)
    self.ptrfail = set()
    self.ptrmap = {}
    self.asnmap = self._asninit()
    self.queries = {}
    self.ipaddrs = {}
    self.notify = {}
    self.resstat = {}
    debug("IP2INFO", 1, "initialisation complete %.2f", time() - now)
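A hedged sketch of driving the resolver directly, based on how HostCache uses it later in this section: `submit()` takes a list of hosts, a kind of "ip" or "name" and a callback invoked as `callback(info, origin, remain)`, and `process()` is then pumped from a loop. The cache path, address and the bounded polling loop are placeholders, not the project's service loop.

    from time import time

    resolver = IPResolver(cachedir="/tmp/ip2info", maxtime=15)

    def on_result(info, origin, remain):
        # Callback signature mirrors Reply.__call__ shown further below.
        print "resolved", info.ip, "->", info.hostname, info.domain

    resolver.submit(["192.0.2.10"], kind="ip", callback=on_result)

    # Pump the event loop for a bounded time; in the real server the
    # HostCache service thread does this continuously.
    deadline = time() + 30
    while time() < deadline:
        resolver.process(1)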
def responder(answer, addr): debug("IP2INFO", 2, "cname result %s from %s: %s", addr, info.hostname, answer) self._tick_stats(answer[0]) task.done = True task.ongoing = False if addr not in info.cnames: info.cnames[addr] = [] for cname in answer[3]: info.all_names.update((cname,)) cnames = info.cnames[addr] cnames.append(cname) if len(cnames) < 5: self._submit(cname, rr.CNAME, responder) task.ongoing = True task.done = False
def _pngdata(self, task, c, imgdata):
    """Return image `imgdata` as a PNG image. Returns `imgdata` as is if
    the response content type is image/png, otherwise converts the data
    into PNG format and returns that instead."""
    ctype = c.getinfo(pycurl.CONTENT_TYPE)
    if not (ctype and ctype.startswith("image/")):
        cherrypy.log("SCRAPER ERROR %s content type '%s' not an image,"
                     " headers %s"
                     % (c.getinfo(pycurl.EFFECTIVE_URL), ctype, c.headers))
        return None
    elif ctype != 'image/png':
        debug(self._ID, 3, "%s: converting image %s to png", task.key, ctype)
        png = StringIO()
        PILImage.open(StringIO(imgdata)).save(png, "PNG")
        imgdata = png.getvalue()
        png.close()
    return imgdata
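The conversion above is plain PIL round-tripping through in-memory buffers. A small self-contained sketch of the same idiom, not tied to this class:

    # Generate a JPEG in memory, then re-encode it as PNG without
    # touching the filesystem.
    from StringIO import StringIO
    from PIL import Image as PILImage

    jpeg = StringIO()
    PILImage.new("RGB", (8, 8), (255, 0, 0)).save(jpeg, "JPEG")

    png = StringIO()
    PILImage.open(StringIO(jpeg.getvalue())).save(png, "PNG")
    pngdata = png.getvalue()
    assert pngdata.startswith("\x89PNG")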
def _notify_hosts(self, info, task, tasks):
    """Notify completely looked up host objects."""
    assert isinstance(info, HostInfo)
    key = ("name", info.hostname)
    assert key in self.notify
    debug("IP2INFO", 2, "notify callbacks host %s %d %s %s",
          info.hostname, len(self.notify[key]),
          info.all_addrs, info.ipaddrs)
    assert len(info.all_addrs) == len(info.ipaddrs)
    debug("IP2INFO", 2, "notify callbacks host %s %d",
          info.hostname, len(self.notify[key]))
    callbacks = self.notify[key]
    del self.notify[key]
    for f in callbacks:
        f(info, None, 0)
    return True
def _timeout(self):
    """Cancel queries which have timed out after `self.maxtime`."""
    # Scan for queries which have been going on for too long.
    now = time()
    expired = []
    for q, info in self.queries.iteritems():
        if now > info[0] + self.maxtime:
            expired.append((q, info))

    # Now expire them. Call the callbacks so the tasks move on.
    debug("IP2INFO", 3, "cancelling %d timed out queries", len(expired))
    for q, info in expired:
        (created, callback, args) = info
        del self.queries[q]
        q.cancel()
        callback((-1, None, None, tuple()), *args)
def _asn_lookup_3(self, info, task, tasks):
    """Perform third step of AS lookups for IP addresses by using
    cymru.com reverse mapping DNS servers."""
    debug("IP2INFO", 2, "asn lookup/cymru %s %s", info.ip, info.asn.asn)

    # Report ready if we already have a result.
    if info.asn != self.NULL_ASN:
        return True

    # Define responder to cymru.com ASN lookup query. Expects 1-tuple
    # "ASN | CIDR | CC | RIR | YYYY-MM-DD" answer. Keeps the last
    # record of the answer received, it's the most specific CIDR. If
    # this creates ASInfo it will request ASN cc lookup too.
    def responder(answer, addr):
        debug("IP2INFO", 3, "cymru result %s from %s: %s",
              addr, info.ip, answer)
        self._tick_stats(answer[0])
        task.ongoing = False
        if len(answer[3]) > 0 and len(answer[3][-1]) == 1:
            m = RX_ASN.match(answer[3][-1][0])
            if m:
                task.done = True
                if m.group(1) in self.asnmap:
                    debug("IP2INFO", 3, "cymru existing asn %s", m.group(1))
                    info.asn = self.asnmap[m.group(1)]
                else:
                    debug("IP2INFO", 2, "cymru new asn %s, cidr %s, cc %s",
                          m.group(1), m.group(2), m.group(3))
                    tasks.insert(1, IPTask(self._asn_lookup_cc, 2))
                    info.asn = self.asnmap[m.group(1)] = \
                        ASInfo(asn=m.group(1), cc=m.group(3),
                               rir=m.group(4), date=m.group(5))

    # Do reverse TXT lookup on IP address from cymru.com DNS.
    revaddr = info.ip.reverse_dns
    if revaddr.endswith(".in-addr.arpa."):
        rev = revaddr[:-14] + ".origin.asn.cymru.com"
    elif revaddr.endswith(".ip6.arpa."):
        rev = revaddr[:-10] + ".origin6.asn.cymru.com"
    else:
        assert False, "reverse address %s not recognised" % revaddr
    debug("IP2INFO", 3, "submitting asn lookup %s", rev)
    self._submit(rev, rr.TXT, responder)
    return False
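For orientation, a self-contained sketch of how the cymru.com query name is derived from a reversed address, assuming the IPAddress type used here is netaddr's (whose `reverse_dns` includes the trailing dot the slicing above relies on); the address is a documentation example, not from this codebase:

    from netaddr import IPAddress

    revaddr = IPAddress("192.0.2.10").reverse_dns
    assert revaddr == "10.2.0.192.in-addr.arpa."
    query = revaddr[:-len(".in-addr.arpa.")] + ".origin.asn.cymru.com"
    print query   # 10.2.0.192.origin.asn.cymru.com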
def __init__(self, appconfig): debug(self._ID, 1, "creating new content cache") Thread.__init__(self, name="ContentCache") self._ssl = SSLOptions(key_file=appconfig.x509key, cert_file=appconfig.x509cert, ca_path=appconfig.x509cadir) self._reqman = RequestManager(num_connections=10, ssl_opts=self._ssl, user_agent=self._ident, handle_init=self._hinit, request_init=self._reqinit, request_error=self._reqerror, request_respond=self._reqdone) self._cv = Condition() self._stopme = False self._values = {} cherrypy.engine.subscribe('start', self.start) cherrypy.engine.subscribe('stop', self.stop, priority=100)
def _hostname_lookup(self, info, task, tasks):
    """Issue reverse name lookup for an IP address."""
    debug("IP2INFO", 2, "hostname lookup %s %s %s",
          info.ip, info.hostname, info.domain)

    # Report ready if we already have a result.
    if info.hostname or info.domain:
        return True

    # Do DNS reverse hostname lookup.
    def responder(hostname, ip):
        debug("IP2INFO", 2, "hostname %s -> %s", info.ip, hostname)
        task.ongoing = False
        if hostname != None:
            info.hostname = hostname
            task.done = True

    self._submit_ptr(responder, info.ip, rr.PTRraw)
    return False
def _domain_lookup(self, info, task, tasks):
    """Look up domain part based on whatever name we managed to get."""
    debug("IP2INFO", 2, "domain lookup %s %s %s",
          info.ip, info.hostname, info.domain)
    if not info.domain:
        if info.hostname:
            info.domain = self.psl.domain(info.hostname)
        elif info.cidrhost:
            info.domain = self.psl.domain(info.cidrhost)
        elif info.wildhost:
            info.domain = self.psl.domain(info.wildhost)
        elif info.asn and info.asn.asn:
            info.domain = "AS#%s (%s)" % (info.asn.asn, info.asn.org)
    if not info.hostname:
        info.hostname = str(info.ip)
    return True
def __init__(self, appconfig): debug(self._ID, 1, "creating new content cache") Thread.__init__(self, name = "ContentCache") self._ssl = SSLOptions(key_file = appconfig.x509key, cert_file = appconfig.x509cert, ca_path = appconfig.x509cadir) self._reqman = RequestManager(num_connections = 10, ssl_opts = self._ssl, user_agent = self._ident, handle_init = self._hinit, request_init = self._reqinit, request_error = self._reqerror, request_respond = self._reqdone) self._cv = Condition() self._stopme = False self._values = {} cherrypy.engine.subscribe('start', self.start) cherrypy.engine.subscribe('stop', self.stop, priority=100)
def purge(self):
    """Purge cached information and reload databases if possible."""
    now = time()
    if self.queries:
        return
    for _, tasks in self.ipaddrs.values():
        if tasks:
            return
    assert not self.ptrmap
    assert not self.queries
    assert not self.notify
    self.ptrfail = set()
    self.asnmap = self._asninit()
    self.ipaddrs = {}
    self.resstat = {}
    self.gip.reload()
    self.psl.reload()
    debug("IP2INFO", 1, "reload complete %.2f", time() - now)
def __call__(self, info, origin, remain):
    debug("HOSTDATA", 3, "replied to %s", self)
    if self.kind == "name":
        if isinstance(info, HostInfo):
            if remain:
                debug("HOSTDATA", 2,
                      "host %s: %d out of %d host addresses,"
                      " waiting for remaining %d",
                      info.hostname, len(info.ipaddrs),
                      len(info.all_addrs), remain)
            else:
                assert info.hostname in self.pending
                self.pending.remove(info.hostname)
                debug("HOSTDATA", 1,
                      "host %s: all %d addresses resolved, %d requests remain",
                      info.hostname, len(info.ipaddrs), len(self.pending))
                with self.signal:
                    if not self.result:
                        self.result = []
                    self.result.append(self._hostinfo(info))
                    if not self.pending:
                        self.signal.notifyAll()
        else:
            debug("HOSTDATA", 1, "%s: ignoring address update for %s",
                  (origin and origin.hostname), info.ip)
    elif self.kind == "ip":
        assert isinstance(info, IPInfo)
        assert info.ip in self.pending
        assert not remain
        self.pending.remove(info.ip)
        debug("HOSTDATA", 1, "ip %s: address resolved, %d requests remain",
              info.ip, len(self.pending))
        with self.signal:
            if not self.result:
                self.result = []
            self.result.append(self._ipinfo(info))
            if not self.pending:
                self.signal.notifyAll()
    else:
        assert False, "internal error, lookup neither host nor ip"
class HostCache(Thread):
    """Utility to resolve host information."""
    _PURGE_INTERVAL = 4 * 3600
    _NUM_SIGS = 8

    def __init__(self, statedir):
        Thread.__init__(self, name="HostCache")
        self._ip2i = IPResolver(cachedir=statedir, maxtime=15)
        self._cv = Condition()
        self._stopme = False
        self._requests = []
        self._last_purge = time.time()
        self._signals = map(lambda x: Condition(), xrange(0, self._NUM_SIGS))
        cherrypy.engine.subscribe('start', self.start)
        cherrypy.engine.subscribe('stop', self.stop, priority=100)

    def _purge(self):
        now = time.time()
        debug("HOSTDATA", 1, "purging address resolver")
        self._last_purge = time.time()
        self._ip2i.purge()

    def statistics(self):
        with self._cv:
            return self._ip2i.statistics()

    def reset_statistics(self):
        with self._cv:
            self._ip2i.reset_statistics()

    def purge(self):
        with self._cv:
            self._purge()

    def stop(self):
        debug("HOSTDATA", 1, "requesting to stop resolver thread")
        with self._cv:
            self._stopme = True
            self._cv.notifyAll()

    def lookup(self, kind, hosts, maxwait=30):
        """
        Lookup information either by IP address or host name.

        :arg str kind: "ip" or "name"
        :arg list hosts: list of host name string, ip address or a real name
        :arg float maxwait: maximum time in seconds to wait for a result.
        """
        reply = Reply()
        reply.kind = kind
        reply.until = time.time() + maxwait
        reply.signal = random.choice(self._signals)
        reply.pending = set(hosts)

        with self._cv:
            self._requests.append(Task(kind, hosts, reply))
            self._cv.notifyAll()

        with reply.signal:
            while True:
                if self._stopme:
                    raise RuntimeError("server stopped")
                elif reply.error:
                    raise reply.error
                elif not reply.pending:
                    reply.finished = True
                    return reply.result
                else:
                    reply.signal.wait()

    def run(self):
        with self._cv:
            while not self._stopme:
                npending = 0
                ncurreq = len(self._requests)

                # Insert any new requests. If they fail, remember the error.
                for r in self._requests:
                    if not r.reply.submitted:
                        debug("HOSTDATA", 1, "submitting request: %s %s",
                              r.kind, r.hosts)
                        r.reply.submitted = True
                        try:
                            self._ip2i.submit(r.hosts, kind=r.kind,
                                              callback=r.reply)
                        except Exception, e:
                            r.reply.error = e

                # Pump any pending lookups for up to .25 seconds. Note that
                # this will wait only as long as needed, and will quit
                # immediately if there is no work at all. It's not unusual
                # we need to wait longer than this for final results; see
                # the check further on.
                try:
                    self._cv.release()
                    npending = self._ip2i.process(.25)
                finally:
                    self._cv.acquire()

                # Post-process requests. Remove fully completed, expired
                # and failed lookups from the request queue.
                nmodified = 0
                now = time.time()
                for r in self._requests[:]:
                    rr = r.reply
                    if rr.finished:
                        debug("HOSTDATA", 2, "request completed: %s %s",
                              r.kind, r.hosts)
                        self._requests.remove(r)
                        nmodified += 1
                    elif rr.submitted and rr.until < now:
                        debug("HOSTDATA", 1, "request has expired: %s %s",
                              r.kind, r.hosts)
                        self._requests.remove(r)
                        with rr.signal:
                            rr.error = RuntimeError("maximum wait time exhausted")
                            rr.signal.notifyAll()
                        nmodified += 1
                    elif rr.submitted and rr.error:
                        debug("HOSTDATA", 1, "request failed: %s %s",
                              r.kind, r.hosts)
                        self._requests.remove(r)
                        with rr.signal:
                            rr.signal.notifyAll()
                        nmodified += 1

                # Wait to be notified, but only if we don't already have
                # work to do.
                skipwait = (self._stopme or npending or nmodified
                            or len(self._requests) != ncurreq)
                debug("HOSTDATA", 2,
                      ("wait for signal, %d pending, %d requests"
                       " now vs. %d before, %d modified: %s"),
                      npending, len(self._requests), ncurreq, nmodified,
                      (skipwait and "skipping unnecessary wait") or "waiting")
                if not skipwait:
                    if now - self._last_purge > self._PURGE_INTERVAL:
                        self._purge()
                    self._cv.wait((self._requests and 0.25) or None)
                    debug("HOSTDATA", 2, "wait done")

        debug("HOSTDATA", 1, "server thread stopped")
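A hedged usage sketch of HostCache from calling code: in a running CherryPy server the thread is started and stopped by the engine subscriptions above, here it is driven by hand. The state directory and host name are placeholders.

    cache = HostCache(statedir="/tmp/hostcache")
    cache.start()                        # normally done via cherrypy.engine
    try:
        for item in cache.lookup("name", ["www.example.org"], maxwait=20):
            print item
    finally:
        cache.stop()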
def run(self): with self._cv: while not self._stopme: debug(self._ID, 1, "executing scrape cycle") now = time.time() for s in self._scrape: for title, url in s["urls"].iteritems(): key = s["section"] + (title, ) debug(self._ID, 3, "%s: considering %s", key, url) _, val = self._get(key) if val.expires < now: if isinstance(url, tuple): url, urledit = url if urledit: url = urledit(url) debug(self._ID, 2, "%s: refetching expired %s (%.2f ago)", key, url, now - val.expires) self._reqman.put( Task(url, key, s["period"], s["content_type"], s["convert"], None)) debug(self._ID, 1, "processing requests") self._reqman.process(lock=self._cv.acquire, unlock=self._cv.release) debug(self._ID, 1, "waiting") if not self._stopme: self._cv.wait(30) debug(self._ID, 1, "wait done") debug(self._ID, 1, "server thread stopped")
def __init__(self, appconfig): debug(self._ID, 1, "creating new content scraper") ContentCache.__init__(self, appconfig) self._scrape = []
def _reqdone(self, c):
    result = c.task.result
    try:
        code = c.getinfo(pycurl.HTTP_CODE)
        debug(self._ID, 2, "request done %s => http %d", c.task.url, code)
        if code != 200:
            raise RuntimeError("http response %d from %s"
                               % (code, c.task.url))

        value = c.buffer.getvalue()
        for h in c.headers:
            m = RX_CONTENT_ENCODING.match(h)
            if m:
                enc = m.group(1)
                if enc == "deflate":
                    debug(self._ID, 3, "decompressing deflated content")
                    value = zlib.decompress(value, -zlib.MAX_WBITS)
                elif enc == "gzip":
                    debug(self._ID, 3, "decompressing gzipped content")
                    value = gzip.GzipFile(fileobj=StringIO(value)).read()
                else:
                    cherrypy.log("WARNING: ignoring content encoding %s for %s"
                                 % (enc, c.task.url))

        if c.task.convert:
            debug(self._ID, 3, "converting value for %s, len %d",
                  c.task.url, len(value))
            value = c.task.convert(c.task, c, value)

        if value:
            debug(self._ID, 1, "storing value for %s into %s, expires %d",
                  c.task.url, c.task.key, c.task.period)
            self._store(c.task, value)

        if result:
            with result["signal"]:
                debug(self._ID, 2, "signaling result on %s, pending %d",
                      c.task.url, result["pending"])
                assert result["pending"] > 0
                result["pending"] -= 1
                if result["pending"] == 0:
                    result["signal"].notifyAll()
    except Exception, e:
        cherrypy.log(("CACHE ERROR %s processing failed with error:"
                      " %s, headers %s") % (c.task.url, str(e), c.headers))
        for line in traceback.format_exc().rstrip().split("\n"):
            cherrypy.log(" " + line)
        if result:
            with result["signal"]:
                debug(self._ID, 2, "signaling error on %s, pending %d",
                      c.task.url, result["pending"])
                if not result["error"]:
                    result["error"] = e
                result["signal"].notifyAll()
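The two decompression branches above are the standard zlib and gzip idioms: raw deflate needs the negative window-bits flag, while gzip bodies are unpacked through a file object. A standalone sketch of the same idiom, with a synthetic payload:

    import gzip
    import zlib
    from StringIO import StringIO

    payload = "x" * 1000

    # Raw deflate (no zlib header): strip the 2-byte header and 4-byte
    # checksum from a zlib stream, then decompress with -MAX_WBITS.
    raw = zlib.compress(payload)[2:-4]
    assert zlib.decompress(raw, -zlib.MAX_WBITS) == payload

    # Gzip: write through GzipFile into a buffer, then read it back the
    # same way _reqdone() does.
    buf = StringIO()
    gz = gzip.GzipFile(fileobj=buf, mode="wb")
    gz.write(payload)
    gz.close()
    assert gzip.GzipFile(fileobj=StringIO(buf.getvalue())).read() == payload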
def run(self): with self._cv: while not self._stopme: debug(self._ID, 1, "processing requests") self._reqman.process(lock=self._cv.acquire, unlock=self._cv.release) debug(self._ID, 1, "purging values") self._purge(time.time(), None, None, self._values) debug(self._ID, 1, "waiting") if not self._stopme: self._cv.wait() debug(self._ID, 1, "wait done") debug(self._ID, 1, "server thread stopped, waking waiters") for s in self._signals: with s: s.notifyAll() debug(self._ID, 1, "server thread stopped")
def __init__(self, appconfig): debug(self._ID, 1, "creating new content proxy") ContentCache.__init__(self, appconfig) self._signals = map(lambda x: Condition(), xrange(0, self._NUM_SIGS))
def fetch(self, section, expires, urls, content_type="application/json",
          convert=None, merge=None):
    """
    Retrieve data from URLs, caching it locally for `expires` seconds.
    Usually the content is JSON but it can be something else too, like
    HTML. All the URLs will be fetched, converted using `convert`,
    stored, then merged to a new value with `merge`.

    :arg str section: label for this item
    :arg int expires: maximum time to cache the responses
    :arg str content_type: expected content type in response
    :arg callable convert: response conversion, e.g. cjson.decode
    :arg callable merge: reply post-processor
    :arg dict urls: (title, url) or (title, (url, urledit)) of data to retrieve
    """
    debug(self._ID, 1, "%s: fetch from %s, expires %d, content type %s",
          section, urls, expires, content_type)
    if len(urls) > 1 and not merge:
        raise ValueError("merge needed to reduce %s from %s"
                         % (section, urls))
    if not merge:
        merge = lambda group: group[urls.keys()[0]].data
    if isinstance(section, basestring):
        section = (section,)

    now = time.time()
    merged = section + ("merged",)
    signal = self._signals[(hash(merged) >> 24) % self._NUM_SIGS]
    reply = {"pending": 0, "error": None, "signal": signal}

    with self._cv:
        if not self._has(merged):
            debug(self._ID, 2, "%s: inserting null value", merged)
            self._put(merged, 0, None)
        for title, url in urls.iteritems():
            key = section + (title,)
            if self._has(key):
                _, val = self._get(key)
                if val.expires >= now:
                    debug(self._ID, 2, "%s: valid value for %s", key, url)
                    continue
            else:
                debug(self._ID, 2, "%s: inserting null value for %s", key, url)
                self._put(key, 0, None)
            if isinstance(url, tuple):
                url, urledit = url
                if urledit:
                    url = urledit(url)
            reply["pending"] += 1
            self._reqman.put(Task(url, key, expires, content_type,
                                  convert, reply))
            debug(self._ID, 2, "%s: requested %s", key, url)
        if reply["pending"]:
            debug(self._ID, 3, "%s: signaling requests", section)
            self._cv.notifyAll()

    with signal:
        while True:
            if self._stopme:
                debug(self._ID, 3, "%s: reply cancelled for stop", merged)
                raise RuntimeError("server stopped")
            elif reply["error"]:
                debug(self._ID, 2, "%s: reply was an error", merged)
                raise reply["error"]
            elif not reply["pending"]:
                debug(self._ID, 2, "%s: reply complete", merged)
                break
            else:
                debug(self._ID, 3, "%s: waiting for reply", merged)
                signal.wait()

    with self._cv:
        newval = None
        now = time.time()
        if not self._has(merged):
            # Unlikely but possible it got removed.
            debug(self._ID, 2, "%s: replacing lost key", merged)
            self._put(merged, 0, None)
        group, val = self._get(merged)
        if val.expires >= now:
            debug(self._ID, 1, "%s: returning valid value", merged)
            return val.data
        else:
            debug(self._ID, 2, "%s: merging new value", merged)
            newval = merge(group)
            self._put(merged, now + expires, newval)
            return newval
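A hedged usage sketch of fetch(): `proxy` is assumed to be a ContentProxy instance, the section, URLs and titles are placeholders, and the merge callable follows the structure implied by the default merge above (a group indexed by title whose items carry `.data`).

    def combine(group):
        # Mirror the default merge: index the group by title, read .data.
        return dict((title, group[title].data) for title in ("cpu", "jobs"))

    data = proxy.fetch("site-status", 300,
                       {"cpu":  "https://example.invalid/api/cpu.json",
                        "jobs": "https://example.invalid/api/jobs.json"},
                       convert=lambda task, c, body: cjson.decode(body),
                       merge=combine)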