def get_xml(self, uri, html=False): """Retrieve the resource using the url. It parses response as XML or HTML and returns the parsed DOM object. :param uri: [str] The uri to retrieve. :param headers: [dict(header_name: value)] Optional HTTP headers to send in the request. :param html: [bool] Optional flag to parse the response as HTML. :return: [lxml_obj] Parsed DOM. """ try: page = self.request(uri) except HandlerError as he: raise HandlerError(he, status=404) try: page_data = page.content if not html: parser = etree.XMLParser(recover=True) else: parser = etree.HTMLParser(recover=True) return etree.parse(StringIO.StringIO(page_data), parser) except Exception as e: logging.error("Cannot parse XML/HTML from %s" % uri) raise HandlerError("Couldn't parse data from %s" % uri, 404)
def get_memento(self, req_uri, accept_datetime):
    logging.debug("Begin fetching mementos for: %s" % req_uri)
    p = urlparse.urlparse(req_uri)
    host = p[1]
    # Serve only URIs whose host matches one of the configured hosts
    if not any(host.find(h) > -1 for h in self.hosts):
        return
    timestamp = date_str(accept_datetime, self.TIMESTAMPFMT)
    params = {
        'rvlimit': 1,  # Only need one
        'rvstart': timestamp,  # Start listing from here
        'rvdir': 'older'  # List in decreasing order
    }
    # Finds the API and title using scraping
    api_base_uri = None
    try:
        dom = self.get_xml(req_uri, html=True)
        links = dom.xpath("//link")
        for link in links:
            if link.attrib['rel'].lower() == "edituri":
                api_base_uri = link.attrib['href'].split("?")[0]
                if api_base_uri.startswith("//"):
                    api_base_uri = api_base_uri.replace("//", "http://", 1)
        parsed_url = urlparse.urlparse(req_uri)
        try:
            title = urlparse.parse_qs(parsed_url[4])['title'][0]
        except Exception:
            title = parsed_url.path.split('/')[-1]
        logging.debug(
            "Orain handler: API found: %s, page title parsed to: %s"
            % (api_base_uri, title))
        if not title:
            raise HandlerError("Cannot find title", 404)
        if not api_base_uri:
            raise HandlerError("Cannot find Orain API on page", 404)
        else:
            title = urllib2.unquote(title)
    except HandlerError as he:
        raise he
    except Exception as e:
        logging.error(
            "OrainHandler: error querying and parsing page for title/api:"
            " %s. Handler will return an empty response." % e)
        return None
    base_uri = api_base_uri.replace("api.php", "index.php")
    return self.query(req_uri, params, title, api_base_uri, base_uri)
def get_all_mementos(self, req_url):
    # Implements the changes list for this particular proxy
    parameters = {'q': req_url, 'subject': 'url'}
    uri = baseuri + urllib.urlencode(parameters)
    try:
        jsonobj = self.request(uri).json()
    except Exception as e:
        logging.error("Cannot request API or parse JSON response: %s" % e)
        raise HandlerError("Cannot get API response.", 404)
    changes = []
    if int(jsonobj['availableHits']) == 0:
        return []
    tmid = jsonobj['hits'][0]['ID']
    tmuri = "http://haw.nsk.hr/publikacija/" + tmid
    try:
        data = self.request(tmuri).content
    except Exception as e:
        logging.error("Error querying API: %s" % e)
        raise HandlerError("Cannot get API response.", 404)
    uriRegex = re.compile(r'<tr><td>[\d]*\.</td>.*</tr>')
    dtregex = re.compile(r'<td>\d\d\.\d\d\.\d\d\d\d[0-9\.:\s]*</td>')
    uris = re.findall(uriRegex, data)
    for u in uris:
        d = u.index("title")
        loc = "http://haw.nsk.hr/" + u[45:d - 2].lstrip('/')
        result = dtregex.search(u)
        if result:
            # Reshape 'dd.mm.yyyy hh:mm:ss' into 'yyyymmddhhmmss GMT'
            dtstr = result.group(0)
            dtstr = dtstr[4:-5]
            dtstr = dtstr[6:10] + dtstr[3:5] + dtstr[0:2] + \
                dtstr[11:19].replace(":", "") + " GMT"
            changes.append((loc, dtstr))
    return changes
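# Worked example (illustration only) of the date reshaping above: the
# slicing turns a 'dd.mm.yyyy hh:mm:ss' table cell into the
# 'yyyymmddhhmmss GMT' form used in the TimeMap. The sample cell below is
# made up; only the slicing logic comes from get_all_mementos.
def _demo_reshape():
    dtstr = "<td>01.02.2013 12:34:56</td>"
    dtstr = dtstr[4:-5]  # strip '<td>' and '</td>' -> '01.02.2013 12:34:56'
    dtstr = dtstr[6:10] + dtstr[3:5] + dtstr[0:2] + \
        dtstr[11:19].replace(":", "") + " GMT"
    return dtstr  # -> '20130201123456 GMT'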
def get_all_mementos(self, requri):
    if requri == 'http://lanlsource.lanl.gov/hello':
        wcurl = 'http://webcitation.org/5jq247bmx'
    elif requri == 'http://lanlsource.lanl.gov/pics/picoftheday.png':
        wcurl = 'http://webcitation.org/5jq24MRo3'
    elif requri == 'http://odusource.cs.odu.edu/pics/picoftheday.png':
        wcurl = 'http://webcitation.org/5k9j4oXPw'
    else:
        return self.get_from_xml(requri)  # Cleaner but much slower
        # wcurl = 'http://webcitation.org/query.php?url=' + requri

    # Fast screen scraping
    txheaders = {}
    try:
        req = urllib2.Request(wcurl, None, txheaders)
        fh = urllib2.urlopen(req)
        fh.close()
        req = urllib2.Request('http://webcitation.org/topframe.php')
        fh = urllib2.urlopen(req)
        data = fh.read()
        fh.close()
    except Exception:
        raise HandlerError('Cannot request page', 404)
    changes = []
    try:
        parser = etree.HTMLParser()
        dom = etree.parse(StringIO.StringIO(data), parser)
    except Exception:
        raise HandlerError('Cannot parse HTML', 404)

    opts = dom.xpath('//select[@name="id"]/option')
    for o in opts:
        fid = o.attrib['value']
        date = o.text
        if date.find('(failed)') > -1:
            continue
        changes.append(('http://webcitation.org/query?id=' + fid, date))
    return changes
def get_all_mementos(self, uri_r):
    try:
        # Extract the resource ID
        match = self.rex.match(uri_r)
        if not match:
            raise HandlerError("URI does not match a valid resource.", 404)
        parts = match.groups()
        base = parts[0]
        type = parts[1]
        resource = parts[2]
        normalized_uri = '%s/%s/%s' % (base, type, resource)

        # Prepares the API call
        params = {
            'verb': 'GetRecord',
            'identifier': 'oai:arXiv.org:%s' % resource,
            'metadataPrefix': 'arXivRaw'
        }

        # Queries the API and extracts the values
        response = self.request(self.api_base, params=params)
        if not response:
            raise HandlerError("API response not 2XX", 404)
        root = etree.parse(StringIO(response.content),
                           etree.XMLParser(recover=True))
        versions = root.findall(
            './/{http://arxiv.org/OAI/arXivRaw/}version')

        # Processes the result
        def mapper(version):
            v = version.xpath('@*')[0]  # version label, e.g. 'v1'
            date = version.find(
                './{http://arxiv.org/OAI/arXivRaw/}date').text
            return (normalized_uri + v, date)

        return map(mapper, versions)
    except HandlerError as he:
        raise he
    except Exception as e:
        logging.error('Arxiv handler exception: %s, returning 404' % e)
        return
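# Illustration only: how the mapper above behaves on a hypothetical
# arXivRaw GetRecord fragment. The namespace and element names come from
# the handler code; the sample document and its values are assumptions.
def _demo_arxiv_versions():
    from StringIO import StringIO
    from lxml import etree
    sample = """<versions xmlns="http://arxiv.org/OAI/arXivRaw/">
      <version version="v1"><date>Mon, 2 Apr 2007 19:18:42 GMT</date></version>
      <version version="v2"><date>Tue, 8 May 2007 11:24:13 GMT</date></version>
    </versions>"""
    root = etree.parse(StringIO(sample), etree.XMLParser(recover=True))
    pairs = []
    for version in root.findall('.//{http://arxiv.org/OAI/arXivRaw/}version'):
        v = version.xpath('@*')[0]  # first (only) attribute value, e.g. 'v1'
        date = version.find('./{http://arxiv.org/OAI/arXivRaw/}date').text
        pairs.append((v, date))
    return pairs  # e.g. [('v1', 'Mon, 2 Apr 2007 ...'), ('v2', ...)]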
def get_memento(self, uri_r, req_datetime):
    uri_r = uri_r + '/'
    # Check if the URI is one archived website
    matches = [x for x in self.pages_list if uri_r.startswith(x[0])]
    if len(matches) == 0:
        raise HandlerError(
            "Pastpages does not have archives of that website.", 404)
    if len(matches) > 1:
        logging.error("URI conflict in pastpages' API URI list.")
        raise HandlerError("Error in pastpages API")
    site_slug = matches[0][1]
    params = {
        'limit': 1,
        'site__slug': site_slug,
        'timestamp__lte': req_datetime.strftime(self.API_TIMEFMT)
    }
    request = '/api/beta/screenshots/'
    json_response = self.request(self.BASE + request, params=params).json()
    if 'error' in json_response:
        logging.error("Error in pastpages response: " +
                      str(json_response['error']))
        return
    result_list = [
        # 'objects' is the list of responses
        # 'objects.absolute_url' is the URI. It exists if 'objects.has_image'
        (self.BASE + obj['absolute_url'], obj['timestamp'])
        for obj in json_response['objects'] if obj['has_image']
    ]
    if result_list:
        if len(result_list) > 1:
            logging.error(
                "API returned more than one object. Returning the first.")
        return result_list[0]
    else:
        # No Memento found
        return
def get_xml(self, uri, html=False):
    page = self.request(uri)
    try:
        page_data = page.content
        if not html:
            parser = etree.XMLParser(recover=True)
        else:
            parser = etree.HTMLParser(recover=True)
        return etree.parse(StringIO.StringIO(page_data), parser)
    except Exception:
        logging.error("Cannot parse XML/HTML from %s" % uri)
        raise HandlerError("Couldn't parse data from %s" % uri, 404)
def get_all_mementos(self, uri_r):
    # WILL BE TOO SLOW: TOO MANY WEBSITES.
    # Deactivate TimeMaps.
    logging.warning(
        "get_all_mementos used: Pastpages will probably have too big"
        " TimeMaps. Expect timeouts.")
    matches = [x for x in self.pages_list if uri_r.startswith(x[0])]
    if len(matches) == 0:
        raise HandlerError(
            "Pastpages does not have archives of that website.", 404)
    if len(matches) > 1:
        logging.error("URI conflict in pastpages' API URI list.")
        raise HandlerError("Error in pastpages API")
    site_slug = matches[0][1]
    params = {'limit': self.LIMIT_MAX, 'site__slug': site_slug}
    request = '/api/beta/screenshots/'
    has_next = True
    image_list = []
    # Keep going while there are still result pages
    while has_next:
        json_response = self.request(self.BASE + request,
                                     params=params).json()
        image_list.extend([
            # 'objects' is the list of responses
            # 'objects.absolute_url' is the URI of the memento.
            # It exists if 'objects.has_image'
            (self.BASE + obj['absolute_url'], obj['timestamp'])
            for obj in json_response['objects'] if obj['has_image']
        ])
        request = json_response['meta']['next']
        params = None  # the request already contains &limit and &offset
        # Each response has a non-null 'meta.next' value if it has a
        # continuation
        has_next = request is not None
    return image_list
def get_all_mementos(self, uri):
    MAX_TIME = 120  # seconds
    match_spec_name = self.re_spec_name.match(uri)
    if not match_spec_name:
        raise HandlerError(
            "Unknown W3C specification URI. \n" + ACCEPTABLE_RESOURCE, 404)
    spec_name = match_spec_name.groups()[1]
    if spec_name.endswith("/"):
        spec_name = spec_name[:-1]
    api_response = self.request(self.api_url % (spec_name, APIKEY))
    if not api_response.status_code == 200:
        raise HandlerError(
            "No versions were found for the requested specification"
            " with shortname: %s" % spec_name, 404)
    try:
        json_response = api_response.json()
    except Exception:
        raise HandlerError("The W3C API returned an unknown response.", 502)
    if not (json_response.get("_embedded") and
            json_response.get("_embedded").get("version-history")):
        raise HandlerError("The W3C API returned an unknown response.", 502)
    versions = map(
        lambda version: (version.get("uri"), version.get("date")),
        json_response.get("_embedded").get("version-history"))
    return sorted(versions, key=lambda version: version[1])
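# Illustration only: the JSON shape get_all_mementos expects from the W3C
# API. The field names mirror the lookups above ('_embedded',
# 'version-history', 'uri', 'date'); the sample entries are invented.
def _demo_w3c_versions():
    sample = {
        "_embedded": {
            "version-history": [
                {"uri": "http://www.w3.org/TR/2014/REC-example-20141028/",
                 "date": "2014-10-28"},
                {"uri": "http://www.w3.org/TR/2008/WD-example-20080122/",
                 "date": "2008-01-22"},
            ]
        }
    }
    versions = map(lambda v: (v.get("uri"), v.get("date")),
                   sample["_embedded"]["version-history"])
    # Sorted by date, oldest first, as in the handler
    return sorted(versions, key=lambda version: version[1])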
def get_memento(self, uri_r, req_datetime):
    # Suppose you have a special rule for certain dates.
    if req_datetime.year < 1999:
        # In this case, we do not serve anything before 1999.
        # Return a custom error to the client.
        raise HandlerError("Cannot serve a Memento before 1999", status=404)
    else:
        # Gets all mementos for this URI
        mementos_list = self.get_all_mementos(uri_r)
        # Find the single best Memento for this uri_r and this date
        (uri_m, date_time) = mementos_list[-1]  # In this example we take the last one
        return (uri_m, date_time)  # The return value is a tuple here.
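# The example above simply takes the last Memento. A real handler would
# usually pick the Memento closest to (at or before) the requested
# datetime. A minimal sketch of that selection, assuming mementos_list is
# sorted by datetime (this helper is illustrative, not part of the API):
def closest_before(mementos, accept_dt):
    # mementos: list of (uri_m, datetime) pairs sorted by datetime.
    # Returns the latest pair not newer than accept_dt, falling back to
    # the earliest Memento when all of them are newer.
    best = mementos[0]
    for uri_m, dt in mementos:
        if dt <= accept_dt:
            best = (uri_m, dt)
        else:
            break
    return best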
def get_all_mementos(self, req_url):
    # Implements the changes list for this particular proxy
    uri = BASEURI + req_url
    try:
        resp = self.request(uri)
        data = resp.content
    except Exception as e:
        logging.error("Cannot request URI: %s" % e)
        raise HandlerError("Cannot request URI", 404)
    changes = []
    uris = re.findall(self.uriRegex, data)
    for u in uris:
        dtstr = u[0]
        loc = u[1]
        dtstr += " GMT"
        changes.append((loc, dtstr))
    return changes
def get_from_xml(self, requri):
    api_request = ('http://webcitation.org/query.php?returnxml=1&url='
                   + requri)
    xml = self.request(api_request, timeout=120)
    try:
        parser = etree.XMLParser(recover=True)  # Parses bad XML
        dom = etree.parse(StringIO.StringIO(str(xml.text)), parser)
    except Exception as e:
        logging.error('Cannot parse XML: ' + str(e))
        raise HandlerError('Cannot parse XML', 404)

    results = []
    successes = dom.xpath("//result[@status='success']")
    for s in successes:
        url = s.find('webcite_url').text
        date = s.find('timestamp').text
        results.append((url, date))
    return results
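# Illustration only: the XPath and element lookups above applied to a
# hypothetical WebCitation response. The element names ('result',
# 'webcite_url', 'timestamp') mirror the code; the sample values are
# made up.
def _demo_webcite_parse():
    from StringIO import StringIO
    from lxml import etree
    sample = """<results>
      <result status="success">
        <webcite_url>http://webcitation.org/5jq247bmx</webcite_url>
        <timestamp>2009-09-14 21:22:00</timestamp>
      </result>
      <result status="failure"/>
    </results>"""
    dom = etree.parse(StringIO(sample), etree.XMLParser(recover=True))
    return [(s.find('webcite_url').text, s.find('timestamp').text)
            for s in dom.xpath("//result[@status='success']")]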
def get_xml(self, uri, params=None, html=False):
    """Retrieves the resource using the URI, parses it as XML or HTML
    and returns the parsed DOM object.

    :param uri: [str] The URI to retrieve.
    :param params: [dict(name: value)] Optional query parameters to send
    with the request.
    :param html: [bool] Optional flag to parse the response as HTML.
    :return: [lxml_obj] Parsed DOM.
    """
    page = self.request(uri, params=params)
    try:
        page_data = page.content
        if not html:
            parser = etree.XMLParser(recover=True)
        else:
            parser = etree.HTMLParser(recover=True)
        return etree.parse(StringIO.StringIO(page_data), parser)
    except Exception:
        logging.error("Cannot parse XML/HTML from %s" % uri)
        raise HandlerError("Couldn't parse data from %s" % uri, 404)
def get_all_mementos(self, uri):
    MAX_TIME = 120  # seconds
    # URI deconstruction
    match = self.rex.match(uri)
    if not match:
        raise HandlerError(
            "GitLab URI does not match a valid resource. \n"
            + ACCEPTABLE_RESOURCE, 404)
    protocol = match.groups()[0]
    base = match.groups()[1]
    user = match.groups()[2]
    repo = match.groups()[3]
    req_path = match.groups()[4]
    path = ''
    branch = ''
    # Processes one result to a (memento, datetime) pair
    mapper = None

    # Defining resource type and response handling.
    # Creates one function for a specific type to map the results to
    # Memento pairs.

    # Resource is a repository
    if not req_path or req_path == '/':
        if req_path:
            path = '/'

        def make_pair(commit):
            memento_path = '/commit/%s' % commit['id']
            uri_m = '%s%s/%s/%s%s' % (
                protocol, base, user, repo, memento_path)
            return (uri_m, commit['created_at'])
        mapper = make_pair
    # Resource is a file
    elif req_path.startswith('/blob/'):
        path = req_path.replace('/blob/', '', 1)
        branch_index = path.find('/')
        branch = path[:branch_index]
        path = path[branch_index:]
        if branch == '' or path == '' or path.endswith('/'):
            raise HandlerError(
                "Not found. Empty path for file in repository", 404)

        def make_pair(commit):
            # HTML resource
            memento_path = '/blob/%s%s' % (commit['id'], path)
            uri_m = '%s%s/%s/%s%s' % (
                protocol, base, user, repo, memento_path)
            return (uri_m, commit['created_at'])
        mapper = make_pair
    # Resource is a raw file
    elif req_path.startswith('/raw/'):
        path = req_path.replace('/raw/', '', 1)
        branch_index = path.find('/')
        branch = path[:branch_index]
        path = path[branch_index:]
        is_online = bool(requests.head(
            uri, params={'private_token': self.apikey}))
        if path == '' or path.endswith('/') or not is_online:
            raise HandlerError(
                "'%s' not found: Raw resource must be a file." % path, 404)

        def make_pair(commit):
            memento_path = '/raw/%s%s' % (commit['id'], path)
            uri_m = '%s%s/%s/%s%s' % (
                protocol, base, user, repo, memento_path)
            return (uri_m, commit['created_at'])
        mapper = make_pair
    # Resource is a directory
    elif req_path.startswith('/tree/'):
        path = req_path.replace('/tree/', '', 1)
        branch_index = path.find('/')
        if branch_index < 0:
            branch_index = len(path)
        branch = path[:branch_index]
        path = path[branch_index:]
        if branch == '':
            raise HandlerError("Not found. Empty branch path", 404)

        def make_pair(commit):
            memento_path = '/commit/%s' % commit['id']
            uri_m = '%s%s/%s/%s%s' % (
                protocol, base, user, repo, memento_path)
            return (uri_m, commit['created_at'])
        mapper = make_pair
    # Resource is a wiki entry, e.g.
    # https://gitlab.example.com/opac/cdrom-opac/wikis/home -->
    # https://gitlab.example.com/opac/cdrom-opac/wikis/home?version_id=b4a9027e2948a5ce9ecd3a9c1641ed958b9f7728
    # The API does not seem to support this: it returns wrong commit IDs.
    # elif req_path.startswith('/wikis/'):
    #     def make_pair(commit):
    #         # HTML resource
    #         memento_path = '%s?version_id=%s' % (req_path, commit['id'])
    #         uri_m = '%s%s/%s/%s%s' % (
    #             protocol, base, user, repo, memento_path)
    #         return (uri_m, commit['created_at'])
    #     mapper = make_pair

    if mapper is None:
        # The resource type is not accepted.
        raise HandlerError(
            "GitLab resource type not found." + ACCEPTABLE_RESOURCE, 404)

    # Initiating request variables.
    # It appears that user/repo can be used instead of a numeric project
    # ID. %2f is a urlencoded slash (/).
    apibase = '%s/projects/%s/repository/commits' % (
        self.api, user + '%2f' + repo)
    params = {
        'per_page': 100,  # Max allowed is 100
        'path': str(path),
        'branches': str(branch),
        'private_token': self.apikey
    }
    aut_pair = ('MementoTimegate', 'LANLTimeGate14')
    cont = apibase  # The first continue is the beginning

    # Does sequential queries to get all commits of the particular resource
    queries_results = []
    tmax = int(time.time()) + MAX_TIME
    while cont is not None:
        if int(time.time()) > tmax:
            raise HandlerError(
                "Resource too big to be served. GitLab handler timeout"
                " (timeout: %d seconds)" % MAX_TIME, 502)
        req = self.request(cont, params=params, auth=aut_pair)
        cont = None
        if not req:  # status code different than 2XX
            raise HandlerError(
                "Cannot find resource on version server."
                " API response: %d" % req.status_code, 404)
        result = req.json()
        if 'message' in result:  # API-specific error
            raise HandlerError(result['message'])
        if 'errors' in result:  # API-specific error
            raise HandlerError(result['errors'])
        if len(result) > 0:
            # The request was successful
            queries_results += result
            # Search for a possible continue
            if 'link' in req.headers:
                link_header = req.headers['link']
                headermatch = self.header_rex.search(link_header)
                if headermatch:
                    # The response was truncated; the rest can be obtained
                    # using the given "next" link
                    cont = headermatch.groups()[0]
    if queries_results:
        # Processes results based on resource type
        return map(mapper, queries_results)
    else:
        # No results found
        raise HandlerError(
            "Resource not found, empty response from API", 404)
def query(self, req_uri, req_params, title, api_base_uri, base_uri):
    params = {
        'action': 'query',
        'format': 'json',
        'prop': 'revisions',
        'rvprop': 'ids|timestamp',
        'indexpageids': '',
        'titles': title
    }
    params.update(req_params)

    # Does sequential queries to get all revision IDs and timestamps
    queries_results = []
    cont = {}
    condition = True
    while condition:
        # Clone the original request
        newparams = params.copy()
        # Modify it with the values returned in the 'continue' section
        # of the last result.
        newparams.update(cont)
        req = self.request(api_base_uri, params=newparams)
        try:
            result = req.json()
        except Exception:
            logging.error("No JSON can be decoded from API %s"
                          % api_base_uri)
            raise HandlerError("No API answer.", 404)
        if 'error' in result:
            raise HandlerError(result['error'])
        if 'warnings' in result:
            # logging.warn(result['warnings'])
            pass
        try:
            # The request was successful.
            # The JSON key of the page (only one)
            pid = result['query']['pageids'][0]
            queries_results += result['query']['pages'][pid]['revisions']
            if ('missing' in result['query']['pages'][pid] or
                    'invalid' in result['query']['pages'][pid]):
                raise HandlerError(
                    "Cannot find resource on version server.", 404)
        except Exception:
            if req_params['rvdir'] == 'older':
                # Retry, listing in the other direction
                req_params['rvdir'] = 'newer'
                return self.query(
                    req_uri, req_params, title, api_base_uri, base_uri)
            else:
                raise HandlerError("No revision returned from API.", 404)
        if 'continue' in result:
            # The response was truncated; the rest can be obtained using
            # &rvcontinue=ID
            cont = result['continue']
            condition = True
        else:
            condition = False

    # Processes the revision list into (uri, datetime) pairs
    def f(rev):
        rev_uri = base_uri + '?title=%s&oldid=%d' % (
            urllib2.quote(title), rev['revid'])
        dt = rev['timestamp']
        return (rev_uri, dt)
    return map(f, queries_results)
def get_memento(self, req_uri, accept_datetime):
    timestamp = date_str(accept_datetime, self.TIMESTAMPFMT)
    params = {
        'rvlimit': 1,  # Only need one
        'rvstart': timestamp,  # Start listing from here
        'rvdir': 'older'  # List in decreasing order
    }
    # Finds the API and title using scraping
    api_base_uri = None
    try:
        dom = self.get_xml(req_uri, html=True)
        links = dom.xpath("//link")
        for link in links:
            if link.attrib['rel'].lower() == "edituri":
                api_base_uri = link.attrib['href'].split("?")[0]
                if api_base_uri.startswith("//"):
                    api_base_uri = api_base_uri.replace("//", "http://", 1)
        parsed_url = urlparse.urlparse(req_uri)
        try:
            title = urlparse.parse_qs(parsed_url[4])['title'][0]
        except Exception:
            title = parsed_url.path.split('/')[-1]
        logging.debug(
            "Mediawiki handler: API found: %s, page title parsed to: %s"
            % (api_base_uri, title))
        if not title:
            raise HandlerError("Cannot find title", 404)
        if not api_base_uri:
            raise HandlerError("Cannot find MediaWiki API on page", 404)
        else:
            title = urllib2.unquote(title)
    except HandlerError as he:
        raise he
    except Exception as e:
        logging.error(
            "MediaWikiHandler: error querying and parsing page for"
            " title/api: %s. Handler will return an empty response." % e)
        return None
    base_uri = api_base_uri.replace("api.php", "index.php")

    # The best Memento
    memento = self.query(req_uri, params, title,
                         api_base_uri, base_uri)[0]

    # The first Memento
    if title in self.inner_cache and memento:
        logging.debug("Wiki handler: found cached first for " + title)
        first = self.inner_cache[title]
    else:
        logging.debug("Wiki handler: querying first for " + title)
        first_params = {
            'rvlimit': 1,  # Only need one
            'rvstart': '19900101000000',  # Start listing from 1990
            'rvdir': 'newer'  # List in increasing order
        }
        first = self.query(req_uri, first_params, title,
                           api_base_uri, base_uri)[0]
        if len(self.inner_cache) > self.max_inner_cache_size:
            self.inner_cache = {}
        self.inner_cache[title] = first

    # This handler returns more than only the best Memento.
    # A Link with rel="first memento" will also be returned to the client.
    return [first, memento]
def get_all_mementos(self, uri):
    MAX_TIME = 120  # seconds
    if uri.startswith('http://'):
        uri = uri.replace('http://', 'https://', 1)
    # URI deconstruction
    match = self.rex.match(uri)
    if not match:
        raise HandlerError(
            "GitHub URI does not match a valid resource. \n"
            + ACCEPTABLE_RESOURCE, 404)
    protocol = match.groups()[0]
    base = match.groups()[1]
    user = match.groups()[2]
    repo = match.groups()[3]
    req_path = match.groups()[4]
    path = ''
    branch = ''
    # Processes one result to a (memento, datetime) pair
    mapper = None

    # Defining resource type and response handling.
    # Creates one function for a specific type to map the results to
    # Memento pairs.
    if base == 'github.com/':
        # Resource is a repository
        if not req_path or req_path == '/':
            if req_path:
                path = '/'

            def make_pair(commit):
                return (commit['html_url'].replace('commit', 'tree'),
                        commit['commit']['committer']['date'])
            mapper = make_pair
        # Resource is a file
        elif req_path.startswith('/blob/'):
            path = req_path.replace('/blob/', '', 1)
            branch_index = path.find('/')
            branch = path[:branch_index]
            path = path[branch_index:]
            if branch == '' or path == '' or path.endswith('/'):
                raise HandlerError(
                    "Not found. Empty path for file in repository", 404)

            def make_pair(commit):
                # HTML resource
                memento_path = '/blob/%s%s' % (commit['sha'], path)
                uri_m = '%s%s%s/%s%s' % (protocol, base, user, repo,
                                         memento_path)
                return (uri_m, commit['commit']['committer']['date'])
            mapper = make_pair
        # Resource is a directory
        elif req_path.startswith('/tree/'):
            path = req_path.replace('/tree/', '', 1)
            branch_index = path.find('/')
            if branch_index < 0:
                branch_index = len(path)
            branch = path[:branch_index]
            path = path[branch_index:]
            if branch == '':
                raise HandlerError("Not found. Empty branch path", 404)

            def make_pair(commit):
                return (commit['html_url'].replace('commit', 'tree') + path,
                        commit['commit']['committer']['date'])
            mapper = make_pair
    # Resource is a raw file
    elif base == 'raw.githubusercontent.com/' and req_path is not None:
        path = req_path.replace('/', '', 1)
        branch_index = path.find('/')
        branch = path[:branch_index]
        path = path[branch_index:]
        # Must be done because the API does not make any difference
        # between paths and files
        is_online = bool(requests.head(uri))
        if path == '' or path.endswith('/') or not is_online:
            raise HandlerError(
                "'%s' not found: Raw resource must be a file." % path, 404)

        def make_pair(commit):
            memento_path = '/%s%s' % (commit['sha'], path)
            uri_m = '%s%s%s/%s%s' % (protocol, base, user, repo,
                                     memento_path)
            return (uri_m, commit['commit']['committer']['date'])
        mapper = make_pair

    if mapper is None:
        # The resource type is not accepted.
        raise HandlerError(
            "GitHub resource type not found." + ACCEPTABLE_RESOURCE, 404)

    # Initiating request variables
    apibase = '%s/repos/%s/%s/commits' % (self.api, user, repo)
    params = {
        'per_page': 100,  # Max allowed is 100
        'path': str(path),
        'sha': str(branch)
    }
    aut_pair = ('MementoTimegate', 'LANLTimeGate14')
    cont = apibase  # The first continue is the beginning

    # Does sequential queries to get all commits of the particular resource
    queries_results = []
    tmax = int(time.time()) + MAX_TIME
    while cont is not None:
        if int(time.time()) > tmax:
            raise HandlerError(
                "Resource too big to be served. GitHub handler timeout"
                " (timeout: %d seconds)" % MAX_TIME, 502)
        req = self.request(cont, params=params, auth=aut_pair)
        cont = None
        if not req:  # status code different than 2XX
            raise HandlerError(
                "Cannot find resource on version server."
                " API response: %d" % req.status_code, 404)
        result = req.json()
        if 'message' in result:  # API-specific error
            raise HandlerError(result['message'])
        if 'errors' in result:  # API-specific error
            raise HandlerError(result['errors'])
        if len(result) > 0:
            # The request was successful
            queries_results += result
            # Search for a possible continue
            if 'link' in req.headers:
                link_header = req.headers['link']
                headermatch = self.header_rex.search(link_header)
                if headermatch:
                    # The response was truncated; the rest can be obtained
                    # using the given "next" link
                    cont = headermatch.groups()[0]
    if queries_results:
        # Processes results based on resource type
        return map(mapper, queries_results)
    else:
        # No results found
        raise HandlerError(
            "Resource not found, empty response from API", 404)
def get_memento(self, req_url, dt):
    p = urlparse(req_url)
    host = p[1]
    upath = p[2]
    if host.find('.wikia.com') == -1 and host not in self.hosts:
        return
    exploded_path = upath.rsplit('/', 1)
    if len(exploded_path) > 1 and exploded_path[0]:
        (pref, title) = exploded_path
        # look for /wiki
        pref = pref.replace('/wiki', '')
    else:
        raise HandlerError("No article title found in requested URI.", 404)

    changes = []
    defaultProtocol = "http://"
    dtfmstr = "%Y%m%d%H%M%S"
    dt_del = timedelta(seconds=1)
    dt_next = (dt + dt_del).strftime(dtfmstr)
    dt = dt.strftime(dtfmstr)

    url_list = []
    # URL for getting the Memento and the previous one
    url_list.append(
        "%s%s/api.php?format=xml&action=query&prop=revisions"
        "&rvprop=timestamp|ids|user&rvlimit=2&redirects=1"
        "&titles=%s&rvdir=older&rvstart=%s"
        % (defaultProtocol, host, title, dt))
    # URL for the next Memento (starts one second after the requested
    # datetime)
    url_list.append(
        "%s%s/api.php?format=xml&action=query&prop=revisions"
        "&rvprop=timestamp|ids|user&rvlimit=2&redirects=1"
        "&titles=%s&rvdir=newer&rvstart=%s"
        % (defaultProtocol, host, title, dt_next))
    # URL for the last Memento
    url_list.append(
        "%s%s/api.php?format=xml&action=query&prop=revisions"
        "&rvprop=timestamp|ids|user&rvlimit=1&redirects=1&titles=%s"
        % (defaultProtocol, host, title))
    # URL for the first Memento
    url_list.append(
        "%s%s/api.php?format=xml&action=query&prop=revisions"
        "&rvprop=timestamp|ids|user&rvlimit=1&redirects=1&rvdir=newer"
        "&titles=%s"
        % (defaultProtocol, host, title))

    base = "%s%s%s/index.php?title=%s&oldid=" % (
        defaultProtocol, host, pref, title)
    hdrs = {'Host': host}
    for url in url_list:
        dom = self.get_xml(url, headers=hdrs)
        revs = dom.xpath('//rev')
        for r in revs:
            dt = r.attrib['timestamp']
            dtobj = dateparser.parse(r.attrib['timestamp'])
            changes.append((base + r.attrib['revid'], dt))
    return changes