def application(env, start_response):
    """
    WSGI entry point.

    Routes the request to the proxy module named in MementoProxy's
    configuration, dispatching to either a TimeGate or a TimeMap handler.

    :param env: the environment variables from the http server.
    :param start_response: the function that will trigger the response.
    :return: the response body (iterable of strings), or a 404 when no
        proxy matches the requested path.
    """
    req_path = env.get("REQUEST_URI", "/")
    req_datetime = env.get("HTTP_ACCEPT_DATETIME")
    accept_datetime = dateparser.parse(now())
    if req_datetime:
        try:
            accept_datetime = dateparser.parse(req_datetime)
            if accept_datetime.tzinfo is None or \
                    accept_datetime.tzinfo.utcoffset(accept_datetime) is None:
                # Naive date. Reparse with Timezone
                req_datetime += " GMT"
                accept_datetime = dateparser.parse(req_datetime)
        except Exception:
            # Unparseable Accept-Datetime header: fall through with None
            # and let the handler decide.
            accept_datetime = None
    if not req_path.startswith("/"):
        req_path = "/" + req_path
    mem_proxy = MementoProxy()
    if mem_proxy.path:
        # Strip only the leading mount-point occurrence; replacing every
        # occurrence could mangle a target URL containing the same substring.
        req_path = req_path.replace(mem_proxy.path, "", 1)
    req_proxy = req_path.split("/")[0]
    if req_proxy in mem_proxy.proxies:
        # The proxy module name doubles as the class name (capitalized).
        module_path = "proxy." + mem_proxy.proxies.get(req_proxy)
        module = importlib.import_module(module_path)
        class_str = mem_proxy.proxies.get(req_proxy)
        class_str = class_str[0].upper() + class_str[1:]
        proxy_class = getattr(module, class_str)
        proxy = proxy_class(env, start_response)
        proxy.proxy_part = req_proxy
        req_serv = req_path.replace(req_proxy + "/", "", 1)
        if req_serv.startswith(mem_proxy.timegate_url_part):
            # The [1:] drops the "/" separating the service part from the URL.
            req_url = req_serv.replace(mem_proxy.timegate_url_part, "", 1)[1:]
            req_url = process_req_url(req_url, proxy)
            if "wiki" in req_proxy:
                return proxy.handle_timegate(req_url, accept_datetime,
                                             wiki=True)
            else:
                return proxy.handle_timegate(req_url, accept_datetime)
        elif req_serv.startswith(mem_proxy.timemap_url_part):
            req_url = req_serv.replace(mem_proxy.timemap_url_part, "", 1)[1:]
            req_url = req_url.replace(mem_proxy.timemap_link_url_part,
                                      "", 1)[1:]
            req_url = process_req_url(req_url, proxy)
            return proxy.handle_timemap(req_url)
    start_response("404 Not Found", [('Content-Type', 'text/html')])
    return ["Requested resource not found."]
def fetch_memento(self, req_url, dt=None):
    """
    Fetch memento (revision) information for a Wikipedia article URL.

    Queries the MediaWiki API for the revisions around *dt* (previous,
    next, first and last) and collects them as change tuples.

    :param req_url: the requested article URL; must be a *.wikipedia.org URL.
    :param dt: optional datetime of the requested memento; defaults to now.
    :return: a sorted list of (datetime, revision_url, info) tuples, or
        None when the URL is not a wikipedia.org URL or the API reply
        cannot be parsed.
    """
    changes = []
    # Dots escaped: the original unescaped '.' also matched look-alike
    # hosts such as "en-wikipedia-org".
    valid = re.compile(r'^(http://|https://)(.+\.wikipedia\.org)')
    match = valid.match(req_url)
    default_protocol = "http://"
    dtfmstr = "%Y%m%d%H%M%S"
    if match is None:
        return
    if not dt:
        dt = dateparser.parse(now())
    dt_del = timedelta(seconds=1)
    dt_next = dt + dt_del
    dt_next = dt_next.strftime(dtfmstr)
    dt = dt.strftime(dtfmstr)
    # str.find instead of the Python-2-only string.find module function.
    title_index = req_url.find('/wiki/')
    title = req_url[title_index + 6:]
    host = match.groups()[1]
    # Build the API URLs in an explicit list instead of the fragile
    # vars()[name] indirection of the original.
    url_list = []
    # url for getting the memento, prev
    mem_prev = "%s%s/w/api.php?format=xml&action=query&prop=revisions&rvprop=timestamp|ids|user&rvlimit=2&redirects=1&titles=%s&rvdir=older&rvstart=%s" % \
        (default_protocol, host, title, dt)
    url_list.append(mem_prev)
    # url for next
    # NOTE(review): dt_next is computed but the request below uses dt,
    # matching the original code — confirm whether dt_next was intended.
    if dt_next:
        next_url = "%s%s/w/api.php?format=xml&action=query&prop=revisions&rvprop=timestamp|ids|user&rvlimit=2&redirects=1&titles=%s&rvdir=newer&rvstart=%s" % \
            (default_protocol, host, title, dt)
        url_list.append(next_url)
    # url for last
    last = "%s%s/w/api.php?format=xml&action=query&prop=revisions&rvprop=timestamp|ids|user&rvlimit=1&redirects=1&titles=%s" % \
        (default_protocol, host, title)
    url_list.append(last)
    # url for first
    first = "%s%s/w/api.php?format=xml&action=query&prop=revisions&rvprop=timestamp|ids|user&rvlimit=1&redirects=1&rvdir=newer&titles=%s" % \
        (default_protocol, host, title)
    url_list.append(first)
    base = "%s%s/w/index.php?title=%s&oldid=" % (default_protocol, host, title)
    for url in url_list:
        revs = []
        dom = self.get_xml(url)
        try:
            revs = dom.xpath('//rev')
        except Exception:
            return
        for r in revs:
            info = {}
            try:
                # The 'user' attribute may be absent (e.g. suppressed user).
                info['dcterms:creator'] = '%s%s/wiki/User:%s' % \
                    (default_protocol, host, r.attrib['user'])
            except Exception:
                pass
            info['type'] = 'valid'
            dtobj = dateparser.parse(r.attrib['timestamp'])
            info['last'] = dtobj
            # unknown usage... but likely loads
            info['obs'] = 0
            changes.append((dtobj, base + r.attrib['revid'], info))
    if changes:
        changes.sort()
        # The newest revision is the current one: mark it as valid "now".
        changes[-1][-1]['last'] = 'now'
    return changes
def fetch_memento(self, req_url, dt=None):
    """
    Fetch memento (revision) information for a Wikia-hosted wiki URL.

    :param req_url: the requested article URL; the host must contain
        ".wikia.com" or be listed in self.hosts.
    :param dt: optional datetime of the requested memento; defaults to now.
    :return: a sorted list of (datetime, revision_url, info) tuples, or
        None when the host is not recognised.
    """
    p = urlparse(req_url)
    host = p[1]
    upath = p[2]
    if '.wikia.com' not in host and host not in self.hosts:
        return
    (pref, title) = upath.rsplit('/', 1)
    if pref:
        # look for /wiki
        pref = pref.replace('/wiki', '')
    changes = []
    defaultProtocol = "http://"
    dtfmstr = "%Y%m%d%H%M%S"
    dt_next = False
    if dt is None:
        nowd = now()
        current = dateparser.parse(nowd)
        dt = current.strftime(dtfmstr)
    else:
        dt_del = timedelta(seconds=1)
        dt_next = dt + dt_del
        dt_next = dt_next.strftime(dtfmstr)
        dt = dt.strftime(dtfmstr)
    # Build the API URLs in an explicit list instead of the fragile
    # vars()[name] indirection of the original.
    url_list = []
    # url for getting the memento, prev
    mem_prev = "%s%s/api.php?format=xml&action=query&prop=revisions&rvprop=timestamp|ids|user&rvlimit=2&redirects=1&titles=%s&rvdir=older&rvstart=%s" % (defaultProtocol, host, title, dt)
    url_list.append(mem_prev)
    # url for next
    # NOTE(review): dt_next is computed but the request below uses dt,
    # matching the original code — confirm whether dt_next was intended.
    if dt_next:
        next_url = "%s%s/api.php?format=xml&action=query&prop=revisions&rvprop=timestamp|ids|user&rvlimit=2&redirects=1&titles=%s&rvdir=newer&rvstart=%s" % (defaultProtocol, host, title, dt)
        url_list.append(next_url)
    # url for last
    last = "%s%s/api.php?format=xml&action=query&prop=revisions&rvprop=timestamp|ids|user&rvlimit=1&redirects=1&titles=%s" % (defaultProtocol, host, title)
    url_list.append(last)
    # url for first
    first = "%s%s/api.php?format=xml&action=query&prop=revisions&rvprop=timestamp|ids|user&rvlimit=1&redirects=1&rvdir=newer&titles=%s" % (defaultProtocol, host, title)
    url_list.append(first)
    base = "%s%s%s/index.php?title=%s&oldid=" % \
        (defaultProtocol, host, pref, title)
    dtobj = None
    # Work on a copy: the original mutated self.hdrs in place, leaking the
    # Host header into every later request made by this instance.
    hdrs = dict(self.hdrs)
    hdrs['Host'] = host
    for url in url_list:
        dom = self.get_xml(url, headers=hdrs)
        revs = dom.xpath('//rev')
        for r in revs:
            info = {}
            try:
                # The 'user' attribute may be absent (e.g. suppressed user).
                info['dcterms:creator'] = '%s%s%s/wiki/User:%s' % \
                    (defaultProtocol, host, pref, r.attrib['user'])
            except Exception:
                pass
            info['type'] = 'valid'
            dtobj = dateparser.parse(r.attrib['timestamp'])
            info['last'] = dtobj
            # unknown usage... but likely loads
            info['obs'] = 0
            changes.append((dtobj, base + r.attrib['revid'], info))
    if changes:
        changes.sort()
        # The newest revision is the current one: mark it as valid "now".
        changes[-1][-1]['last'] = 'now'
    return changes
def fetch_memento(self, req_url, dt=None):
    """
    Fetch memento (revision) information for an arbitrary MediaWiki URL.

    The wiki's api.php endpoint is discovered from the page's EditURI
    <link> element in the fetched HTML.

    :param req_url: the requested page URL.
    :param dt: optional datetime of the requested memento; defaults to now.
    :return: a sorted list of (datetime, revision_url, info) tuples, or
        None when the page or its API endpoint cannot be retrieved.
    """
    changes = []
    # Protocol-relative URLs are fetched over plain http.
    if req_url.startswith("//"):
        req_url = req_url.replace("//", "http://")
    dtfmstr = "%Y%m%d%H%M%S"
    parsed_url = urlparse.urlparse(req_url)
    # Work on a copy: the original mutated self.hdrs in place, leaking the
    # Host header into every later request made by this instance.
    headers = dict(self.hdrs)
    headers['Host'] = parsed_url[1]
    dt_next = False
    if dt is None:
        nowd = now()
        current = dateparser.parse(nowd)
        dt = current.strftime(dtfmstr)
    else:
        dt_del = timedelta(seconds=1)
        dt_next = dt + dt_del
        dt_next = dt_next.strftime(dtfmstr)
        dt = dt.strftime(dtfmstr)
    title = None
    api_base_url = None
    try:
        # parse_qs returns a list per key; the original embedded the whole
        # list into the API URL ("...titles=['X']..."), take the first
        # value instead.
        title = urlparse.parse_qs(parsed_url[4]).get('title', [None])[0]
    except Exception:
        pass
    try:
        dom = self.get_xml(req_url, headers=headers, html=True)
    except Exception:
        return
    links = dom.xpath("//link")
    for link in links:
        if link.attrib['rel'].lower() == "edituri":
            api_base_url = link.attrib['href'].split("?")[0]
            if api_base_url.startswith("//"):
                api_base_url = api_base_url.replace("//", "http://")
    if api_base_url is None:
        # No EditURI link found: without an api.php endpoint every request
        # below would be made against the literal string "None?format=...".
        return
    if not title:
        url_parts = req_url.split("/")
        title = url_parts[-1].split("?")[0]
    # Build the API URLs in an explicit list instead of the fragile
    # vars()[name] indirection of the original.
    url_list = []
    # url for getting the memento, prev
    mem_prev = "%s?format=xml&action=query&prop=revisions&rvprop=timestamp|ids|user&rvlimit=2&redirects=1&titles=%s&rvdir=older&rvstart=%s" % (api_base_url, title, dt)
    url_list.append(mem_prev)
    # url for next
    # NOTE(review): dt_next is computed but the request below uses dt,
    # matching the original code — confirm whether dt_next was intended.
    if dt_next:
        next_url = "%s?format=xml&action=query&prop=revisions&rvprop=timestamp|ids|user&rvlimit=2&redirects=1&titles=%s&rvdir=newer&rvstart=%s" % (api_base_url, title, dt)
        url_list.append(next_url)
    # url for last
    last = "%s?format=xml&action=query&prop=revisions&rvprop=timestamp|ids|user&rvlimit=1&redirects=1&titles=%s" % (api_base_url, title)
    url_list.append(last)
    # url for first
    first = "%s?format=xml&action=query&prop=revisions&rvprop=timestamp|ids|user&rvlimit=1&redirects=1&rvdir=newer&titles=%s" % (api_base_url, title)
    url_list.append(first)
    base = "%s?title=%s&oldid=" % (api_base_url.replace("api.php", "index.php"), title)
    dtobj = None
    for url in url_list:
        dom = None
        try:
            dom = self.get_xml(url)
        except Exception:
            # Best effort: skip API endpoints that fail to answer.
            pass
        if not dom:
            continue
        dom = dom.getroot()
        revs = dom.xpath('//rev')
        for r in revs:
            info = {}
            try:
                # The 'user' attribute may be absent (e.g. suppressed user).
                info['dcterms:creator'] = '%s/wiki/User:%s' % \
                    (api_base_url, r.attrib['user'])
            except Exception:
                pass
            info['type'] = 'valid'
            dtobj = dateparser.parse(r.attrib['timestamp'])
            info['last'] = dtobj
            # unknown usage... but likely loads
            info['obs'] = 0
            changes.append((dtobj, base + r.attrib['revid'], info))
    if changes:
        changes.sort()
        # The newest revision is the current one: mark it as valid "now".
        changes[-1][-1]['last'] = 'now'
    return changes
def fetch_changes(self, req_url, dt=None):
    """
    Fetch the full revision history for an arbitrary MediaWiki URL.

    The wiki's api.php endpoint is discovered from the page's EditURI
    <link> element; the revision list is paged through via the API's
    query-continue mechanism.

    :param req_url: the requested page URL.
    :param dt: optional datetime; only used to format the request window.
    :return: a sorted list of (datetime, revision_url, info) tuples, or
        None when the page or its API endpoint cannot be retrieved.
    """
    changes = []
    # Protocol-relative URLs are fetched over plain http.
    if req_url.startswith("//"):
        req_url = req_url.replace("//", "http://")
    dtfmstr = "%Y%m%d%H%M%S"
    parsed_url = urlparse.urlparse(req_url)
    # Work on a copy: the original mutated self.hdrs in place, leaking the
    # Host header into every later request made by this instance.
    headers = dict(self.hdrs)
    headers['Host'] = parsed_url[1]
    dt_next = False
    if dt is None:
        nowd = now()
        current = dateparser.parse(nowd)
        dt = current.strftime(dtfmstr)
    else:
        dt_del = timedelta(seconds=1)
        dt_next = dt + dt_del
        dt_next = dt_next.strftime(dtfmstr)
        dt = dt.strftime(dtfmstr)
    title = None
    api_base_url = None
    try:
        # parse_qs returns a list per key; the original embedded the whole
        # list into the API URL — take the first value instead.
        title = urlparse.parse_qs(parsed_url[4]).get('title', [None])[0]
    except Exception:
        pass
    try:
        dom = self.get_xml(req_url, headers=headers, html=True)
    except Exception:
        return
    links = dom.xpath("//link")
    for link in links:
        if link.attrib['rel'].lower() == "edituri":
            api_base_url = link.attrib['href'].split("?")[0]
            if api_base_url.startswith("//"):
                api_base_url = api_base_url.replace("//", "http://")
    if api_base_url is None:
        # No EditURI link found: the API URL below would otherwise be the
        # literal string "None?format=...".
        return
    if not title:
        url_parts = req_url.split("/")
        title = url_parts[-1].split("?")[0]
    # with extra info
    url = "%s?format=xml&action=query&prop=revisions&meta=siteinfo&rvprop=timestamp|ids|user&rvlimit=5000&redirects=1&titles=%s"\
        % (api_base_url, title)
    base = "%s?title=%s&oldid=" % (api_base_url.replace("api.php", "index.php"), title)
    dom = self.get_xml(url)
    dtobj = None
    while dom is not None:
        revs = dom.xpath('//rev')
        for r in revs:
            info = {}
            try:
                # The 'user' attribute may be absent (e.g. suppressed user).
                info['dcterms:creator'] = '%s/wiki/User:%s' % \
                    (api_base_url, r.attrib['user'])
            except Exception:
                pass
            info['type'] = 'valid'
            # 'last' is deliberately the previously-processed (newer)
            # revision's datetime; the newest entry is patched to 'now'
            # after sorting.
            info['last'] = dtobj
            dtobj = dateparser.parse(r.attrib['timestamp'])
            # unknown usage... but likely loads
            info['obs'] = 0
            changes.append((dtobj, base + r.attrib['revid'], info))
        # Follow the API's query-continue paging until exhausted.
        cont = dom.xpath('/api/query-continue/revisions/@rvstartid')
        if cont:
            dom = self.get_xml(url + "&rvstartid=" + cont[0])
        else:
            dom = None
    if changes:
        changes.sort()
        # The newest revision is the current one: mark it as valid "now".
        changes[-1][-1]['last'] = 'now'
    return changes