Example #1
def application(env, start_response):
    """
    WSGI entry point.
    :param env: the environment variables from the http server.
    :param start_response: the function that will trigger the response.
    :return: the response.
    """

    req_path = env.get("REQUEST_URI", "/")
    req_datetime = env.get("HTTP_ACCEPT_DATETIME")
    accept_datetime = dateparser.parse(now())
    if req_datetime:
        try:
            accept_datetime = dateparser.parse(req_datetime)
            if accept_datetime.tzinfo is None or \
                    accept_datetime.tzinfo.utcoffset(accept_datetime) is None:
                # Naive datetime; reparse with an explicit GMT timezone.
                req_datetime += " GMT"
                accept_datetime = dateparser.parse(req_datetime)
        except Exception:
            accept_datetime = None

    if not req_path.startswith("/"):
        req_path = "/" + req_path

    mem_proxy = MementoProxy()
    if mem_proxy.path:
        # Strip only the leading mount prefix from the request path.
        req_path = req_path.replace(mem_proxy.path, "", 1)

    req_proxy = req_path.split("/")[0]

    if req_proxy in mem_proxy.proxies:
        module_path = "proxy." + mem_proxy.proxies.get(req_proxy)
        module = importlib.import_module(module_path)
        class_str = mem_proxy.proxies.get(req_proxy)
        class_str = class_str[0].upper() + class_str[1:]
        proxy_class = getattr(module, class_str)

        proxy = proxy_class(env, start_response)
        proxy.proxy_part = req_proxy
        req_serv = req_path.replace(req_proxy + "/", "", 1)
        if req_serv.startswith(mem_proxy.timegate_url_part):
            req_url = req_serv.replace(mem_proxy.timegate_url_part, "", 1)[1:]
            req_url = process_req_url(req_url, proxy)
            if req_proxy.find("wiki") >= 0:
                return proxy.handle_timegate(req_url,
                                             accept_datetime,
                                             wiki=True)
            else:
                return proxy.handle_timegate(req_url, accept_datetime)
        elif req_serv.startswith(mem_proxy.timemap_url_part):
            req_url = req_serv.replace(mem_proxy.timemap_url_part, "", 1)[1:]
            req_url = req_url.replace(mem_proxy.timemap_link_url_part, "",
                                      1)[1:]
            req_url = process_req_url(req_url, proxy)
            return proxy.handle_timemap(req_url)

    start_response("404 Not Found", [('Content-Type', 'text/html')])
    return ["Requested resource not found."]
Example #2
    def fetch_memento(self, req_url, dt=None):
        changes = []
        valid = re.compile(r'^(https?://)(.+\.wikipedia\.org)')
        match = valid.match(req_url)
        default_protocol = "http://"

        dtfmstr = "%Y%m%d%H%M%S"

        if match is None:
            return

        if not dt:
            dt = dateparser.parse(now())

        dt_del = timedelta(seconds=1)
        dt_next = dt + dt_del
        dt_next = dt_next.strftime(dtfmstr)
        dt = dt.strftime(dtfmstr)

        title_index = req_url.find('/wiki/')
        title = req_url[title_index + 6:]

        url_list = []

        # url for getting the memento, prev
        mem_prev = "%s%s/w/api.php?format=xml&action=query&prop=revisions&rvprop=timestamp|ids|user&rvlimit=2&redirects=1&titles=%s&rvdir=older&rvstart=%s" % \
                   (default_protocol, match.groups()[1], title, dt)
        url_list.append(mem_prev)

        # url for next
        if dt_next:
            # Start just past the requested datetime so the memento itself is excluded.
            next_url = "%s%s/w/api.php?format=xml&action=query&prop=revisions&rvprop=timestamp|ids|user&rvlimit=2&redirects=1&titles=%s&rvdir=newer&rvstart=%s" % \
                       (default_protocol, match.groups()[1], title, dt_next)
            url_list.append(next_url)

        # url for last
        last = "%s%s/w/api.php?format=xml&action=query&prop=revisions&rvprop=timestamp|ids|user&rvlimit=1&redirects=1&titles=%s" % \
               (default_protocol, match.groups()[1], title)
        url_list.append(last)

        # url for first
        first = "%s%s/w/api.php?format=xml&action=query&prop=revisions&rvprop=timestamp|ids|user&rvlimit=1&redirects=1&rvdir=newer&titles=%s" % \
                (default_protocol, match.groups()[1], title)
        url_list.append(first)

        base = "%s%s/w/index.php?title=%s&oldid=" % (default_protocol, match.groups()[1], title)

        for url in url_list:
            revs = []
            dom = self.get_xml(url)
            try:
                revs = dom.xpath('//rev')
            except Exception:
                return

            for r in revs:
                info = {}
                try:
                    info['dcterms:creator'] = '%s%s/wiki/User:%s' % \
                                              (default_protocol, match.groups()[1], r.attrib['user'])
                except Exception:
                    pass
                info['type'] = 'valid'
                dtobj = dateparser.parse(r.attrib['timestamp'])
                info['last'] = dtobj
                # unknown usage... but likely loads
                info['obs'] = 0
                changes.append((dtobj, base + r.attrib['revid'], info))                
            
        if changes:
            changes.sort()
            changes[-1][-1]['last'] = 'now'
        return changes
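
Each entry in the returned changes list is a (datetime, memento URI, info dict) tuple, sorted oldest to newest, with the newest entry's info['last'] set to 'now'. A hypothetical call, assuming proxy is an instance of the class that defines this method and the wiki's API is reachable:

# Illustrative only; the proxy class is instantiated by the WSGI entry point above.
from datetime import datetime

changes = proxy.fetch_memento("http://en.wikipedia.org/wiki/Memento_Project",
                              datetime(2014, 6, 1))
if changes:
    for when, memento_uri, info in changes:
        print("%s -> %s (%s)" % (when, memento_uri, info.get('dcterms:creator')))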
Example #3
    def fetch_memento(self, req_url, dt=None):
        p = urlparse(req_url)
        host = p[1]
        upath = p[2]

        if '.wikia.com' not in host and host not in self.hosts:
            return

        (pref, title) = upath.rsplit('/', 1)
        if pref:
            # look for /wiki
            pref = pref.replace('/wiki', '')
        
        changes = []
        defaultProtocol = "http://"

        dtfmstr = "%Y%m%d%H%M%S"

        dt_next = False
        if dt is None:
            nowd = now()    
            current = dateparser.parse(nowd)
            dt = current.strftime(dtfmstr)
        else:
            dt_del = timedelta(seconds=1)
            dt_next = dt + dt_del
            dt_next = dt_next.strftime(dtfmstr)
            dt = dt.strftime(dtfmstr)

        url_list = []

        # url for getting the memento, prev
        mem_prev = "%s%s/api.php?format=xml&action=query&prop=revisions&rvprop=timestamp|ids|user&rvlimit=2&redirects=1&titles=%s&rvdir=older&rvstart=%s" % (defaultProtocol, host, title, dt)
        url_list.append(mem_prev)

        # url for next
        if dt_next:
            # Start just past the requested datetime so the memento itself is excluded.
            next_url = "%s%s/api.php?format=xml&action=query&prop=revisions&rvprop=timestamp|ids|user&rvlimit=2&redirects=1&titles=%s&rvdir=newer&rvstart=%s" % (defaultProtocol, host, title, dt_next)
            url_list.append(next_url)

        # url for last
        last = "%s%s/api.php?format=xml&action=query&prop=revisions&rvprop=timestamp|ids|user&rvlimit=1&redirects=1&titles=%s" % (defaultProtocol, host, title)
        url_list.append(last)

        # url for first
        first = "%s%s/api.php?format=xml&action=query&prop=revisions&rvprop=timestamp|ids|user&rvlimit=1&redirects=1&rvdir=newer&titles=%s" % (defaultProtocol, host, title)
        url_list.append(first)


        base = "%s%s%s/index.php?title=%s&oldid=" % \
               (defaultProtocol, host, pref, title)
        dtobj = None

        # Copy so the shared header dict is not mutated for later requests.
        hdrs = dict(self.hdrs)
        hdrs['Host'] = host

        for url in url_list:
            
            dom = self.get_xml(url, headers=hdrs)
            revs = dom.xpath('//rev')
            for r in revs:
                info = {}
                try:
                    info['dcterms:creator'] = '%s%s%s/wiki/User:%s' %\
                                              (defaultProtocol, host,
                                               pref, r.attrib['user'])
                except Exception:
                    pass
                info['type'] = 'valid'
                dtobj = dateparser.parse(r.attrib['timestamp'])
                info['last'] = dtobj
                # unknown usage... but likely loads
                info['obs'] = 0
                changes.append((dtobj, base + r.attrib['revid'], info))                
            
        if changes:
            changes.sort()
            changes[-1][-1]['last'] = 'now'
        return changes
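
For a concrete sense of what these templates expand to, here is the mem_prev query for a made-up host, title and datetime (all values illustrative): rvdir=older with rvstart set to the requested time asks the API for the two most recent revisions at or before that moment.

host, title, dt = "memory-alpha.wikia.com", "Jean-Luc_Picard", "20140601000000"
mem_prev = ("http://%s/api.php?format=xml&action=query&prop=revisions"
            "&rvprop=timestamp|ids|user&rvlimit=2&redirects=1"
            "&titles=%s&rvdir=older&rvstart=%s" % (host, title, dt))
print(mem_prev)
# http://memory-alpha.wikia.com/api.php?format=xml&action=query&prop=revisions
#   &rvprop=timestamp|ids|user&rvlimit=2&redirects=1&titles=Jean-Luc_Picard
#   &rvdir=older&rvstart=20140601000000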
Example #4
    def fetch_memento(self, req_url, dt=None):
        changes = []
        if req_url.startswith("//"):
            req_url = req_url.replace("//", "http://", 1)


        dtfmstr = "%Y%m%d%H%M%S"

        parsed_url = urlparse.urlparse(req_url)

        # Copy so the shared header dict is not mutated for later requests.
        headers = dict(self.hdrs)
        headers['Host'] = parsed_url[1]

        dt_next = False
        if dt is None:
            nowd = now()    
            current = dateparser.parse(nowd)
            dt = current.strftime(dtfmstr)
        else:
            dt_del = timedelta(seconds=1)
            dt_next = dt + dt_del
            dt_next = dt_next.strftime(dtfmstr)
            dt = dt.strftime(dtfmstr)

        title = None
        api_base_url = None
        try:
            # parse_qs maps each key to a list of values; take the first one.
            title = urlparse.parse_qs(parsed_url[4])['title'][0]
        except Exception:
            pass
        
        try:
            dom = self.get_xml(req_url, headers=headers, html=True)
        except Exception:
            return

        links = dom.xpath("//link")
        for link in links:
            # MediaWiki exposes its api.php endpoint via <link rel="EditURI">.
            if link.attrib.get('rel', '').lower() == "edituri":
                api_base_url = link.attrib['href'].split("?")[0]
                if api_base_url.startswith("//"):
                    api_base_url = api_base_url.replace("//", "http://", 1)

        if not title:
            # Fall back to the last path segment of the request URL.
            title = req_url.split("/")[-1].split("?")[0]

        if not api_base_url:
            # No EditURI link was found, so the wiki's api.php endpoint is unknown.
            return

        url_list = []

        # url for getting the memento, prev
        mem_prev = "%s?format=xml&action=query&prop=revisions&rvprop=timestamp|ids|user&rvlimit=2&redirects=1&titles=%s&rvdir=older&rvstart=%s" % (api_base_url, title, dt)
        url_list.append(mem_prev)

        # url for next
        if dt_next:
            # Start just past the requested datetime so the memento itself is excluded.
            next_url = "%s?format=xml&action=query&prop=revisions&rvprop=timestamp|ids|user&rvlimit=2&redirects=1&titles=%s&rvdir=newer&rvstart=%s" % (api_base_url, title, dt_next)
            url_list.append(next_url)

        # url for last
        last = "%s?format=xml&action=query&prop=revisions&rvprop=timestamp|ids|user&rvlimit=1&redirects=1&titles=%s" % (api_base_url, title)
        url_list.append(last)

        # url for first
        first = "%s?format=xml&action=query&prop=revisions&rvprop=timestamp|ids|user&rvlimit=1&redirects=1&rvdir=newer&titles=%s" % (api_base_url, title)
        url_list.append(first)

        base = "%s?title=%s&oldid=" % (api_base_url.replace("api.php", "index.php"), title)
        dtobj = None

        for url in url_list:
            dom = None
            try:
                dom = self.get_xml(url)
            except Exception:
                pass
            if not dom:
                continue
            dom = dom.getroot()
            revs = dom.xpath('//rev')
            for r in revs:
                info = {}
                try:
                    info['dcterms:creator'] = '%s/wiki/User:%s' %\
                                              (api_base_url, r.attrib['user'])
                except Exception:
                    pass
                info['type'] = 'valid'
                dtobj = dateparser.parse(r.attrib['timestamp'])
                info['last'] = dtobj
                # unknown usage... but likely loads
                info['obs'] = 0
                changes.append((dtobj, base + r.attrib['revid'], info))                
            
        if changes:
            changes.sort()
            changes[-1][-1]['last'] = 'now'
        return changes
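
The main difference from the Wikipedia and Wikia variants above is that the api.php endpoint is discovered from the page itself through its <link rel="EditURI"> element. Below is a self-contained sketch of just that discovery step, run with lxml on a made-up HTML fragment (the original goes through the proxy's get_xml helper instead):

from lxml import html as lxml_html

# Fabricated fragment of the <head> that MediaWiki pages typically emit.
page = ('<html><head><link rel="EditURI" type="application/rsd+xml" '
        'href="//wiki.example.org/w/api.php?action=rsd"/></head><body/></html>')

dom = lxml_html.fromstring(page)
api_base_url = None
for link in dom.xpath("//link"):
    if link.attrib.get('rel', '').lower() == "edituri":
        api_base_url = link.attrib['href'].split("?")[0]
        if api_base_url.startswith("//"):
            api_base_url = api_base_url.replace("//", "http://", 1)
print(api_base_url)  # http://wiki.example.org/w/api.php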
Example #5
    def fetch_changes(self, req_url, dt=None):
        changes = []
        if req_url.startswith("//"):
            req_url = req_url.replace("//", "http://", 1)


        dtfmstr = "%Y%m%d%H%M%S"

        parsed_url = urlparse.urlparse(req_url)

        # Copy so the shared header dict is not mutated for later requests.
        headers = dict(self.hdrs)
        headers['Host'] = parsed_url[1]

        dt_next = False
        if dt is None:
            nowd = now()
            current = dateparser.parse(nowd)
            dt = current.strftime(dtfmstr)
        else:
            dt_del = timedelta(seconds=1)
            dt_next = dt + dt_del
            dt_next = dt_next.strftime(dtfmstr)
            dt = dt.strftime(dtfmstr)

        title = None
        api_base_url = None
        try:
            # parse_qs maps each key to a list of values; take the first one.
            title = urlparse.parse_qs(parsed_url[4])['title'][0]
        except Exception:
            pass

        try:
            dom = self.get_xml(req_url, headers=headers, html=True)
        except Exception:
            return

        links = dom.xpath("//link")
        for link in links:
            # MediaWiki exposes its api.php endpoint via <link rel="EditURI">.
            if link.attrib.get('rel', '').lower() == "edituri":
                api_base_url = link.attrib['href'].split("?")[0]
                if api_base_url.startswith("//"):
                    api_base_url = api_base_url.replace("//", "http://", 1)

        if not title:
            # Fall back to the last path segment of the request URL.
            title = req_url.split("/")[-1].split("?")[0]

        if not api_base_url:
            # No EditURI link was found, so the wiki's api.php endpoint is unknown.
            return

        # Bulk revision query (up to 5000 revisions per request), with site info included.
        url = "%s?format=xml&action=query&prop=revisions&meta=siteinfo&rvprop=timestamp|ids|user&rvlimit=5000&redirects=1&titles=%s"\
              % (api_base_url, title)

        base = "%s?title=%s&oldid=" % (api_base_url.replace("api.php", "index.php"), title)
        dom = self.get_xml(url)
        dtobj = None
        while dom is not None:
            revs = dom.xpath('//rev')
            for r in revs:
                info = {}
                try:
                    info['dcterms:creator'] = '%s/wiki/User:%s' % \
                                              (api_base_url, r.attrib['user'])
                except Exception:
                    pass
                info['type'] = 'valid'
                info['last'] = dtobj
                dtobj = dateparser.parse(r.attrib['timestamp'])
                # unknown usage... but likely loads
                info['obs'] = 0
                changes.append((dtobj, base + r.attrib['revid'], info))
            cont = dom.xpath('/api/query-continue/revisions/@rvstartid')
            if cont:
                dom = self.get_xml(url + "&rvstartid=" + cont[0])
            else:
                dom = None
                
        if changes:
            changes.sort()
            changes[-1][-1]['last'] = 'now'
        return changes
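
Unlike fetch_memento, this variant walks the full revision history by following the API's query-continue element. A minimal sketch of that continuation parsing on a fabricated response, using lxml.etree directly (real responses come from api.php):

from lxml import etree

# Fabricated response: one revision plus a continuation marker.
xml = (b'<api>'
       b'<query-continue><revisions rvstartid="123456"/></query-continue>'
       b'<query><pages><page><revisions>'
       b'<rev revid="999" user="Alice" timestamp="2014-06-01T00:00:00Z"/>'
       b'</revisions></page></pages></query>'
       b'</api>')

dom = etree.fromstring(xml)
print(dom.xpath('//rev/@timestamp'))                        # ['2014-06-01T00:00:00Z']
cont = dom.xpath('/api/query-continue/revisions/@rvstartid')
if cont:
    print("next request adds &rvstartid=" + cont[0])        # ...&rvstartid=123456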