예제 #1
0
    def get_xml(self, uri, html=False):
        """Retrieve the resource at ``uri`` and parse it into a DOM tree.

        The response body is parsed in recovery mode, as XML by default or as
        HTML when ``html`` is set.

        :param uri: [str] The uri to retrieve.
        :param html: [bool] Optional flag to parse the response as HTML.
        :return: [lxml_obj] Parsed DOM.
        :raises HandlerError: With status 404 when the request fails with a
            HandlerError or when the response cannot be parsed.
        """
        try:
            page = self.request(uri)
        except HandlerError as he:
            raise HandlerError(he, status=404)

        parser_cls = etree.HTMLParser if html else etree.XMLParser
        try:
            return etree.parse(StringIO.StringIO(page.content),
                               parser_cls(recover=True))
        except Exception as e:
            # Keep the underlying cause in the log; the client only sees the
            # generic 404 below.
            logging.error("Cannot parse XML/HTML from %s: %s" % (uri, e))
            raise HandlerError("Couldn't parse data from %s" % uri, 404)
예제 #2
0
    def get_memento(self, req_uri, accept_datetime):
        """Find the revision of a wiki page closest to ``accept_datetime``.

        The page at ``req_uri`` is scraped for its ``<link rel="EditURI">``
        element to discover the wiki API endpoint, the page title is taken
        from the ``title`` query argument (or the last path segment), and the
        lookup itself is delegated to ``self.query``.

        :param req_uri: [str] URI of the wiki page.
        :param accept_datetime: Requested datetime, formatted with
            ``self.TIMESTAMPFMT`` for the API.
        :return: Whatever ``self.query`` returns, or None when the host is
            not one of ``self.hosts`` or the page could not be scraped.
        :raises HandlerError: With status 404 when no title or no API
            endpoint can be found.
        """
        logging.debug("Begin Fetching mementos for: %s" % req_uri)

        host = urlparse.urlparse(req_uri)[1]

        # Serve the URI when its host matches at least one configured host.
        # (Bailing out on the first non-matching entry would reject every
        # URI as soon as more than one host is configured.)
        if self.hosts and not any(h in host for h in self.hosts):
            return

        timestamp = date_str(accept_datetime, self.TIMESTAMPFMT)
        params = {
            'rvlimit': 1,  # Only need one
            'rvstart': timestamp,  # Start listing from here
            'rvdir': 'older'  # List in decreasing order
        }

        # Finds the API and title using scraping
        api_base_uri = None
        try:
            dom = self.get_xml(req_uri, html=True)
            for link in dom.xpath("//link"):
                # Many <link> elements carry no rel/href; skip them instead
                # of letting a KeyError abort the whole lookup.
                if link.attrib.get('rel', '').lower() != "edituri":
                    continue
                href = link.attrib.get('href')
                if not href:
                    continue
                api_base_uri = href.split("?")[0]
                if api_base_uri.startswith("//"):
                    # Protocol-relative URI: only the leading '//' needs a
                    # scheme, later '//' occurrences must stay untouched.
                    api_base_uri = "http:" + api_base_uri

            parsed_url = urlparse.urlparse(req_uri)
            try:
                title = urlparse.parse_qs(parsed_url[4])['title'][0]
            except (KeyError, IndexError):
                # No ?title= argument: fall back to the last path segment.
                title = parsed_url.path.split('/')[-1]
            logging.debug(
                "Orain handler: API found: %s, page title parsed to: %s " %
                (api_base_uri, title))
            if not title:
                raise HandlerError("Cannot find Title", 404)
            if not api_base_uri:
                raise HandlerError("Cannot find orain API on page", 404)
            title = urllib2.unquote(title)

        except HandlerError:
            # Bare raise keeps the original traceback.
            raise
        except Exception as e:
            logging.error(
                "OrainHandler: querying and parsing page for title/api %s. handler will return empty response"
                % e)
            return None

        base_uri = api_base_uri.replace("api.php", "index.php")

        return self.query(req_uri, params, title, api_base_uri, base_uri)
예제 #3
0
    def get_all_mementos(self, req_url):
        """List the archived copies of ``req_url`` known to haw.nsk.hr.

        The search API is queried first; when it reports at least one hit,
        the publication page of the first hit is fetched and its table rows
        are scraped for (location, datetime) pairs.

        :param req_url: [str] The original resource URI.
        :return: [list] ``(uri, datetime_str)`` tuples, empty when the API
            reports no hit. The datetime string is ``YYYYMMDDhhmmss GMT``.
        :raises HandlerError: With status 404 when either request fails or
            the API response is not JSON.
        """
        parameters = {}
        parameters['q'] = req_url
        parameters['subject'] = 'url'

        uri = baseuri + urllib.urlencode(parameters)
        try:
            jsonobj = self.request(uri).json()
        except Exception as e:
            # Format the exception: concatenating str + Exception would raise
            # a TypeError here and hide the HandlerError below.
            logging.error(
                "Cannot request API or parse json response: %s" % e)
            raise HandlerError("Cannot get API response.", 404)

        if int(jsonobj['availableHits']) == 0:
            return []

        tmid = jsonobj['hits'][0]['ID']
        tmuri = "http://haw.nsk.hr/publikacija/" + tmid

        try:
            data = self.request(tmuri).content
        except Exception as e:
            logging.error("Error requerying API: %s" % e)
            raise HandlerError("Cannot get API response.", 404)

        row_regex = re.compile(r'<tr><td>[\d]*\.</td>.*</tr>')
        date_regex = re.compile(r'<td>\d\d\.\d\d\.\d\d\d\d[0-9\.:\s]*</td>')

        changes = []
        for row in row_regex.findall(data):
            title_pos = row.find("title")
            date_match = date_regex.search(row)
            if title_pos < 0 or date_match is None:
                # Malformed row: without both a title attribute and a date
                # cell no memento can be built, so skip it rather than crash
                # or reuse the previous row's date.
                continue

            # NOTE(review): the fixed offset 45 assumes the link href always
            # starts at the same column of the row markup - confirm.
            loc = "http://haw.nsk.hr/" + row[45:title_pos - 2].lstrip('/')

            # Strip the surrounding '<td>' / '</td>' tags.
            stamp = date_match.group(0)[4:-5]
            # Reorder 'dd.mm.yyyy hh:mm:ss' into 'yyyymmddhhmmss GMT'.
            stamp = stamp[6:10] + stamp[3:5] + stamp[0:2] + \
                stamp[11:19].replace(":", "") + " GMT"
            changes.append((loc, stamp))

        return changes
예제 #4
0
    def get_all_mementos(self, requri):
        """List the webcitation.org snapshots of ``requri``.

        Three hard-coded URIs are resolved by scraping the snapshot selector
        of ``topframe.php``; every other URI is delegated to
        ``self.get_from_xml``.

        :param requri: [str] The original resource URI.
        :return: [list] ``(uri, date_text)`` tuples.
        :raises HandlerError: When the pages cannot be requested (status 404)
            or the HTML cannot be parsed.
        """
        known_pages = {
            'http://lanlsource.lanl.gov/hello':
                'http://webcitation.org/5jq247bmx',
            'http://lanlsource.lanl.gov/pics/picoftheday.png':
                'http://webcitation.org/5jq24MRo3',
            'http://odusource.cs.odu.edu/pics/picoftheday.png':
                'http://webcitation.org/5k9j4oXPw',
        }
        wcurl = known_pages.get(requri)
        if wcurl is None:
            return self.get_from_xml(requri)  # Cleaner but much slower
            # wcurl = 'http://webcitation.org/query.php?url=' + requri  # Fast
            # screen scraping

        try:
            # The snapshot page is opened but not read.
            # NOTE(review): presumably this primes server-side state for the
            # topframe.php request below - confirm.
            fh = urllib2.urlopen(urllib2.Request(wcurl, None, {}))
            fh.close()

            fh = urllib2.urlopen(
                urllib2.Request('http://webcitation.org/topframe.php'))
            try:
                data = fh.read()
            finally:
                # Always release the connection, even when read() fails.
                fh.close()
        except Exception:
            raise HandlerError('Cannot request page', 404)

        try:
            parser = etree.HTMLParser()
            dom = etree.parse(StringIO.StringIO(data), parser)
        except Exception:
            raise HandlerError('Cannot parse HTML')

        changes = []
        for option in dom.xpath('//select[@name="id"]/option'):
            fid = option.attrib.get('value')
            date = option.text
            # Skip options without an id or label, and failed snapshots.
            if fid is None or date is None or '(failed)' in date:
                continue

            changes.append(('http://webcitation.org/query?id=' + fid, date))

        return changes
예제 #5
0
    def get_all_mementos(self, uri_r):
        """List every version of an arXiv resource.

        The resource identifier is extracted from ``uri_r`` with
        ``self.rex``, the record is fetched from ``self.api_base`` in the
        ``arXivRaw`` metadata format, and one pair is produced per
        ``version`` element found in the response.

        :param uri_r: [str] URI of the arXiv resource.
        :return: The ``(uri, date)`` pairs produced by ``map``, or None when
            an unexpected error occurs (the error is logged).
        :raises HandlerError: With status 404 when the URI does not match or
            the API response is falsy.
        """
        try:
            # Pull base / kind / resource id out of the URI
            found = self.rex.match(uri_r)
            if not found:
                raise HandlerError("URI does not match a valid resource.", 404)
            groups = found.groups()
            base, kind, resource = groups[0], groups[1], groups[2]
            normalized_uri = '%s/%s/%s' % (base, kind, resource)

            # Ask the API for the raw record of this identifier
            api_reply = self.request(self.api_base, params={
                'verb': 'GetRecord',
                'identifier': 'oai:arXiv.org:%s' % resource,
                'metadataPrefix': 'arXivRaw'
            })
            if not api_reply:
                raise HandlerError("API response not 2XX", 404)

            tree = etree.parse(StringIO(api_reply.content),
                               etree.XMLParser(recover=True))
            version_nodes = tree.findall(
                './/{http://arxiv.org/OAI/arXivRaw/}version')

            def to_pair(node):
                # The first attribute of the element is appended to the URI.
                label = node.xpath('@*')[0]
                stamp = node.find(
                    './{http://arxiv.org/OAI/arXivRaw/}date').text
                return (normalized_uri + label, stamp)

            return map(to_pair, version_nodes)

        except HandlerError as he:
            raise he

        except Exception as e:
            logging.error('Arxiv handler exception: %s returning 404' % e)
            return
예제 #6
0
    def get_memento(self, uri_r, req_datetime):
        """Return the screenshot taken at or before ``req_datetime``.

        ``uri_r`` (with a trailing slash appended) must start with exactly
        one prefix from ``self.pages_list``; the slug paired with that
        prefix is used to query the screenshots API for a single result.

        :param uri_r: [str] URI of the original website.
        :param req_datetime: Requested datetime, formatted with
            ``self.API_TIMEFMT``.
        :return: ``(uri, timestamp)`` of the first returned object, or None
            when the API reports an error or returns no object.
        :raises HandlerError: When no prefix matches (status 404) or more
            than one does.
        """
        uri_r = uri_r + '/'
        # Which archived website does this URI belong to?
        candidates = [
            entry for entry in self.pages_list if uri_r.startswith(entry[0])
        ]
        how_many = len(candidates)
        if how_many == 0:
            raise HandlerError(
                "Pastpages does not have archives of that website.", 404)
        if how_many > 1:
            logging.error("Uri conflict in pastpages' API URI list.")
            raise HandlerError("Error in pastpages API")

        query = {
            'limit': 1,
            'site__slug': candidates[0][1],
            'timestamp__lte': req_datetime.strftime(self.API_TIMEFMT)
        }
        payload = self.request(self.BASE + '/api/beta/screenshots/',
                               params=query).json()
        if 'error' in payload:
            logging.error("Error in pastpages response: " +
                          str(payload['error']))
            return

        # 'objects' is the list of responses; each one contributes its
        # absolute URL (prefixed with the base) and its timestamp.
        found = []
        for obj in payload['objects']:
            found.append((self.BASE + obj['absolute_url'], obj['timestamp']))

        if not found:
            # Nothing at or before the requested datetime.
            return
        if len(found) > 1:
            logging.error(
                "API returned more than one object. returning the first")
        return found[0]
예제 #7
0
 def get_xml(self, uri, html=False):
     """Fetch ``uri`` and return its content parsed as a DOM tree.

     :param uri: [str] The uri to retrieve.
     :param html: [bool] Parse as HTML instead of XML when true.
     :return: Parsed DOM.
     :raises HandlerError: With status 404 when parsing fails.
     """
     response = self.request(uri)
     try:
         body = response.content
         # Both parsers run in recovery mode.
         if html:
             dom_parser = etree.HTMLParser(recover=True)
         else:
             dom_parser = etree.XMLParser(recover=True)
         return etree.parse(StringIO.StringIO(body), dom_parser)
     except Exception:
         logging.error("Cannot parse XML/HTML from %s" % uri)
         raise HandlerError("Couldn't parse data from %s" % uri, 404)
예제 #8
0
    def get_all_mementos(self, uri_r):
        """Collect every screenshot that has an image for a website.

        ``uri_r`` must start with exactly one prefix from
        ``self.pages_list``. The screenshots API is then walked page by
        page, following ``meta.next`` until it is null.

        :param uri_r: [str] URI of the original website.
        :return: [list] ``(uri, timestamp)`` tuples.
        :raises HandlerError: When no prefix matches (status 404) or more
            than one does.
        """
        # Time maps are expected to be too large for this archive.
        logging.warning(
            "Get_all_mementos used: Pastpages will probably have too big timemaps. Expect Timeouts"
        )

        candidates = [
            entry for entry in self.pages_list if uri_r.startswith(entry[0])
        ]
        if not candidates:
            raise HandlerError(
                "Pastpages does not have archives of that website.", 404)
        if len(candidates) > 1:
            logging.error("Uri conflict in pastpages' API URI list.")
            raise HandlerError("Error in pastpages API")

        query = {'limit': self.LIMIT_MAX, 'site__slug': candidates[0][1]}
        next_page = '/api/beta/screenshots/'
        collected = []

        # A null 'meta.next' marks the last result page.
        while next_page is not None:
            payload = self.request(self.BASE + next_page,
                                   params=query).json()

            for obj in payload['objects']:
                # Only objects flagged 'has_image' are kept.
                if obj['has_image']:
                    collected.append(
                        (self.BASE + obj['absolute_url'], obj['timestamp']))

            next_page = payload['meta']['next']
            # The continuation link already carries &limit and &offset.
            query = None

        return collected
예제 #9
0
    def get_all_mementos(self, uri):
        """List every published version of a W3C specification.

        The specification short name is extracted from ``uri`` with
        ``self.re_spec_name`` and the W3C API is queried for its version
        history.

        :param uri: [str] URI of the W3C specification.
        :return: [list] ``(uri, date)`` tuples sorted by date.
        :raises HandlerError: With status 404 when the URI is not a known
            specification URI or the API does not answer 200; with status
            502 when the API answer is not JSON or carries no version
            history.
        """
        match_spec_name = self.re_spec_name.match(uri)
        if not match_spec_name:
            raise HandlerError(
                "Unknown W3C specification uri. \n" + ACCEPTABLE_RESOURCE, 404)

        spec_name = match_spec_name.groups()[1]
        if spec_name.endswith("/"):
            spec_name = spec_name[:-1]

        api_response = self.request(self.api_url % (spec_name, APIKEY))

        if api_response.status_code != 200:
            raise HandlerError(
                "No versions were found for the requested specification with shortname: %s"
                % spec_name, 404)

        try:
            json_response = api_response.json()
        except Exception:
            raise HandlerError("The W3C API returned an unknown response.",
                               502)

        # Validate the exact structure read below. The history lives under
        # '_embedded' -> 'version-history'; a missing level is reported as a
        # bad gateway instead of surfacing as an AttributeError/TypeError.
        history = None
        if isinstance(json_response, dict):
            embedded = json_response.get("_embedded") or {}
            history = embedded.get("version-history")
        if history is None:
            raise HandlerError("The W3C API returned an unknown response.",
                               502)

        versions = [(version.get("uri"), version.get("date"))
                    for version in history]
        return sorted(versions, key=lambda version: version[1])
예제 #10
0
    def get_memento(self, uri_r, req_datetime):
        """Return one memento of ``uri_r`` for the requested datetime.

        Example implementation: requests for a year before 1999 are
        rejected, otherwise the last entry of ``self.get_all_mementos`` is
        returned.

        :param uri_r: [str] URI of the original resource.
        :param req_datetime: Requested datetime; only ``.year`` is read.
        :return: [tuple] ``(uri_m, date_time)``.
        :raises HandlerError: With status 404 when the requested year is
            before 1999 or no memento exists for ``uri_r``.
        """
        # Suppose you have a special rule for certain dates
        if req_datetime.year < 1999:
            # In this case, we do not serve anything before 1999
            # Return a custom Error to the client
            raise HandlerError("Cannot serve a Memento before 1999",
                               status=404)

        # Gets all mementos for this URI
        mementos_list = self.get_all_mementos(uri_r)
        if not mementos_list:
            # Nothing to choose from: answer 404 instead of failing with an
            # IndexError on the lookup below.
            raise HandlerError("No Memento found for %s" % uri_r, status=404)

        # Find the best single memento for this uri_r and this date.
        # In this example we take the last one.
        (uri_m, date_time) = mementos_list[-1]

        return (uri_m, date_time)  # The return value is a tuple here.
예제 #11
0
파일: es.py 프로젝트: skbly7/amber_timegate
    def get_all_mementos(self, req_url):
        """List the mementos advertised at ``BASEURI + req_url``.

        The response body is scanned with ``self.uriRegex``; for every hit,
        element 0 is used as the datetime string and element 1 as the
        location.

        :param req_url: [str] The original resource URI.
        :return: [list] ``(location, datetime_str)`` tuples, the datetime
            string being suffixed with " GMT".
        :raises HandlerError: With status 404 when the request fails.
        """
        target = BASEURI + req_url
        try:
            body = self.request(target).content
        except Exception as e:
            logging.error("Cannot request URI: %s" % e)
            raise HandlerError("Cannot request URI", 404)

        # One (location, datetime) pair per regex hit.
        return [(hit[1], hit[0] + " GMT")
                for hit in re.findall(self.uriRegex, body)]
예제 #12
0
    def get_from_xml(self, requri):
        """List the successful webcitation.org snapshots of ``requri``.

        The XML query endpoint is requested with a 120 second timeout and
        every ``result`` element whose status is ``success`` yields one
        pair.

        :param requri: [str] The original resource URI.
        :return: [list] ``(webcite_url, timestamp)`` tuples.
        :raises HandlerError: With status 404 when the XML cannot be parsed.
        """
        response = self.request(
            'http://webcitation.org/query.php?returnxml=1&url=' + requri,
            timeout=120)

        try:
            # Recovery mode copes with malformed XML
            lenient = etree.XMLParser(recover=True)
            tree = etree.parse(StringIO.StringIO(str(response.text)), lenient)
        except Exception as e:
            logging.error('Cannot parse XML: ' + str(e))
            raise HandlerError('Cannot parse XML', 404)

        return [
            (node.find('webcite_url').text, node.find('timestamp').text)
            for node in tree.xpath("//result[@status='success']")
        ]
예제 #13
0
    def get_xml(self, uri, params=None, html=False):
        """Fetch ``uri`` and return its content parsed as a DOM tree.

        :param uri: [str] The uri to retrieve.
        :param params: Optional value forwarded to ``self.request`` as its
            ``params`` argument.
        :param html: [bool] Optional flag to parse the response as HTML
            instead of XML.
        :return: [lxml_obj] Parsed DOM.
        :raises HandlerError: With status 404 when the response cannot be
            parsed.
        """
        response = self.request(uri, params=params)
        try:
            body = response.content
            # Both parsers run in recovery mode.
            if html:
                dom_parser = etree.HTMLParser(recover=True)
            else:
                dom_parser = etree.XMLParser(recover=True)
            return etree.parse(StringIO.StringIO(body), dom_parser)
        except Exception:
            logging.error("Cannot parse XML/HTML from %s" % uri)
            raise HandlerError("Couldn't parse data from %s" % uri, 404)
예제 #14
0
    def get_all_mementos(self, uri):
        """List every version of a GitLab resource.

        ``uri`` is split with ``self.rex`` into protocol, base, user,
        repository and request path. The request path selects the resource
        type: repository root, file (``/blob/``), raw file (``/raw/``) or
        directory (``/tree/``). The commits API is then paged through and
        every commit is turned into one memento.

        :param uri: [str] URI of the GitLab resource.
        :return: [list] ``(uri_m, created_at)`` tuples, one per commit.
        :raises HandlerError: With status 404 when the URI or resource type
            is not supported, the file path is empty, the API answers with a
            non-2XX status or returns nothing; with status 502 when paging
            takes longer than 120 seconds; without explicit status when the
            API reports a ``message`` or ``errors`` entry.
        """
        MAX_TIME = 120  # seconds

        # URI deconstruction
        match = self.rex.match(uri)
        if not match:
            raise HandlerError("GitLab uri does not match a valid resource. \n"
                               + ACCEPTABLE_RESOURCE, 404)
        protocol, base, user, repo, req_path = match.groups()[:5]

        path = ''
        branch = ''
        # Processes one result to (memento, datetime) pair
        mapper = None

        def split_branch(rest):
            """Split 'branch/some/path' into ('branch', '/some/path')."""
            cut = rest.find('/')
            if cut < 0:
                # No slash at all: everything is the branch. Slicing with -1
                # would instead chop the last character off the branch.
                cut = len(rest)
            return rest[:cut], rest[cut:]

        def commit_pair(commit):
            """Memento pointing at the commit page itself."""
            uri_m = '%s%s/%s/%s/commit/%s' % (
                protocol, base, user, repo, commit['id'])
            return (uri_m, commit['created_at'])

        def file_pair(kind):
            """Build a mapper for a file shown under /<kind>/<commit id>."""
            def make_pair(commit):
                memento_path = '/%s/%s%s' % (kind, commit['id'], path)
                uri_m = '%s%s/%s/%s%s' % (
                    protocol, base, user, repo, memento_path)
                return (uri_m, commit['created_at'])
            return make_pair

        # Defining Resource type and response handling
        if not req_path or req_path == '/':
            # Resource is a repository
            if req_path:
                path = '/'
            mapper = commit_pair

        elif req_path.startswith('/blob/'):
            # Resource is a file
            branch, path = split_branch(req_path.replace('/blob/', '', 1))
            if branch == '' or path == '' or path.endswith('/'):
                raise HandlerError(
                    "Not found. Empty path for file in repository", 404)
            mapper = file_pair('blob')

        elif req_path.startswith('/raw/'):
            # Resource is a raw file
            branch, path = split_branch(req_path.replace('/raw/', '', 1))
            # Validate the path first; the HEAD request is only issued for
            # paths that can actually name a file.
            if (path == '' or path.endswith('/') or
                    not bool(requests.head(
                        uri, params={'private_token': self.apikey}))):
                raise HandlerError(
                    "'%s' not found: Raw resource must be a file." %
                    path, 404)
            mapper = file_pair('raw')

        elif req_path.startswith('/tree/'):
            # Resource is a directory
            branch, path = split_branch(req_path.replace('/tree/', '', 1))
            if branch == '':
                raise HandlerError("Not found. Empty branch path", 404)
            mapper = commit_pair

        # Wiki entries (/wikis/...) are not supported: the API does not seem
        # to return usable commit IDs for them.

        if mapper is None:
            # The resource is not accepted.
            raise HandlerError(
                "GitLab resource type not found." + ACCEPTABLE_RESOURCE, 404)

        # Initiating request variables
        # It appears that user/repo can be used instead of a numeric project
        # ID. %2f is a urlencoded slash (/).
        apibase = '%s/projects/%s/repository/commits' % (
            self.api, user + '%2f' + repo)
        params = {
            'per_page': 100,  # Max allowed is 100
            'path': str(path),
            'branches': str(branch),
            'private_token': self.apikey
        }
        # NOTE(review): credentials are hard-coded here; consider moving
        # them to configuration.
        aut_pair = ('MementoTimegate', 'LANLTimeGate14')
        cont = apibase  # The first continue is the beginning

        # Does sequential queries to get all commits of the particular resource
        queries_results = []
        tmax = int(time.time()) + MAX_TIME
        while cont is not None:
            if int(time.time()) > tmax:
                raise HandlerError(
                    "Resource too big to be served. GitLab Handler TimeOut (timeout: %d seconds)" %
                    MAX_TIME, 502)
            req = self.request(cont, params=params, auth=aut_pair)
            cont = None
            if not req:
                # status code different than 2XX
                raise HandlerError(
                    "Cannot find resource on version server. API response %d'd " %
                    req.status_code, 404)
            result = req.json()
            if 'message' in result:
                # API-specific error
                raise HandlerError(result['message'])
            if 'errors' in result:
                # API-specific error
                raise HandlerError(result['errors'])
            if len(result) > 0:
                # The request was successful
                queries_results += result
                # Search for possible continue
                if 'link' in req.headers:
                    headermatch = self.header_rex.search(req.headers['link'])
                    if headermatch:
                        # The response was truncated, the rest can be
                        # obtained using the given "next" link
                        cont = headermatch.groups()[0]

        if not queries_results:
            # No results found
            raise HandlerError(
                "Resource not found, empty response from API", 404)

        # Processes results based on resource type
        return [mapper(commit) for commit in queries_results]
예제 #15
0
    def query(self, req_uri, req_params, title, api_base_uri, base_uri):
        """Fetch revision ids and timestamps of a page from the wiki API.

        The base query (``action=query``, ``prop=revisions``) is combined
        with ``req_params`` and sent to ``api_base_uri``. As long as the
        response carries a ``continue`` section, the request is repeated
        with those continuation values added, so every result page is
        followed until the API stops returning ``continue``.

        When no revision can be read from a response and the requested
        direction is ``older``, the query is retried once in direction
        ``newer``.

        :param req_uri: [str] URI of the requested page (only passed on to
            the retry).
        :param req_params: [dict] Extra API parameters (e.g. ``rvlimit``,
            ``rvstart``, ``rvdir``). It is not modified.
        :param title: [str] Title of the page.
        :param api_base_uri: [str] URI of the wiki ``api.php``.
        :param base_uri: [str] URI of the wiki ``index.php``, used to build
            the revision URIs.
        :return: [list] ``(revision_uri, timestamp)`` tuples.
        :raises HandlerError: With status 404 when the API answer is not
            JSON or no revision is returned; without explicit status when
            the API reports an ``error``.
        """
        params = {
            'action': 'query',
            'format': 'json',
            'prop': 'revisions',
            'rvprop': 'ids|timestamp',
            'indexpageids': '',
            'titles': title
        }
        params.update(req_params)

        # Does sequential queries to get all revisions IDs and Timestamps
        queries_results = []
        continuation = {}
        while True:
            # Start from the original request and overlay the values of the
            # last 'continue' section. The continuation has to live outside
            # the per-iteration copy, otherwise the same request would be
            # repeated forever.
            newparams = params.copy()
            newparams.update(continuation)
            req = self.request(api_base_uri, params=newparams)
            try:
                result = req.json()
            except Exception:
                logging.error("No JSON can be decoded from API %s" %
                              api_base_uri)
                raise HandlerError("No API answer.", 404)
            if 'error' in result:
                raise HandlerError(result['error'])

            revisions = None
            try:
                # the JSON key of the page (only one)
                pid = result['query']['pageids'][0]
                page = result['query']['pages'][pid]
                if 'missing' not in page and 'invalid' not in page:
                    revisions = page['revisions']
            except Exception:
                revisions = None

            if revisions is None:
                # Nothing usable in this direction: look the other way once.
                if req_params.get('rvdir') == 'older':
                    # Work on a copy so the caller's dict stays untouched.
                    retry_params = dict(req_params)
                    retry_params['rvdir'] = 'newer'
                    return self.query(
                        req_uri, retry_params, title, api_base_uri, base_uri)
                raise HandlerError("No revision returned from API.", 404)

            queries_results += revisions

            if 'continue' not in result:
                break
            # The response was truncated; the next request must carry the
            # values of this 'continue' section.
            continuation = result['continue']

        # Processing list
        def to_pair(rev):
            rev_uri = base_uri + '?title=%s&oldid=%d' % (
                urllib2.quote(title), rev['revid'])
            return (rev_uri, rev['timestamp'])

        return [to_pair(rev) for rev in queries_results]
예제 #16
0
    def get_memento(self, req_uri, accept_datetime):
        """Find the best and the first revision of a MediaWiki page.

        The page at ``req_uri`` is scraped for its ``<link rel="EditURI">``
        element to discover the wiki API endpoint, and the page title is
        taken from the ``title`` query argument (or the last path segment).
        The revision at or before ``accept_datetime`` is then queried, as
        well as the very first revision, which is kept in
        ``self.inner_cache`` per title.

        :param req_uri: [str] URI of the wiki page.
        :param accept_datetime: Requested datetime, formatted with
            ``self.TIMESTAMPFMT`` for the API.
        :return: [list] ``[first, memento]``, or None when the page could
            not be scraped.
        :raises HandlerError: With status 404 when no title, no API endpoint
            or no revision can be found.
        """
        timestamp = date_str(accept_datetime, self.TIMESTAMPFMT)
        params = {
            'rvlimit': 1,  # Only need one
            'rvstart': timestamp,  # Start listing from here
            'rvdir': 'older'  # List in decreasing order
        }

        # Finds the API and title using scraping
        api_base_uri = None
        try:
            dom = self.get_xml(req_uri, html=True)
            for link in dom.xpath("//link"):
                # Many <link> elements carry no rel/href; skip them instead
                # of letting a KeyError abort the whole lookup.
                if link.attrib.get('rel', '').lower() != "edituri":
                    continue
                href = link.attrib.get('href')
                if not href:
                    continue
                api_base_uri = href.split("?")[0]
                if api_base_uri.startswith("//"):
                    # Protocol-relative URI: only the leading '//' needs a
                    # scheme, later '//' occurrences must stay untouched.
                    api_base_uri = "http:" + api_base_uri

            parsed_url = urlparse.urlparse(req_uri)
            try:
                title = urlparse.parse_qs(parsed_url[4])['title'][0]
            except (KeyError, IndexError):
                # No ?title= argument: fall back to the last path segment.
                title = parsed_url.path.split('/')[-1]
            logging.debug(
                "Mediawiki handler: API found: %s, page title parsed to: %s " %
                (api_base_uri, title))
            if not title:
                raise HandlerError("Cannot find Title", 404)
            if not api_base_uri:
                raise HandlerError("Cannot find mediawiki API on page", 404)
            title = urllib2.unquote(title)

        except HandlerError:
            # Bare raise keeps the original traceback.
            raise
        except Exception as e:
            logging.error(
                "MediaWikiHandler: querying and parsing page for title/api %s."
                " Handler will return empty response." % e)
            return None

        base_uri = api_base_uri.replace("api.php", "index.php")

        # The best Memento
        best = self.query(req_uri, params, title, api_base_uri, base_uri)
        if not best:
            # An empty answer means there is nothing to serve; report it as
            # 404 instead of failing with an IndexError.
            raise HandlerError("No revision returned from API.", 404)
        memento = best[0]

        # The first Memento
        if title in self.inner_cache and memento:
            logging.debug("Wiki Handler: found cached first for " + title)
            first = self.inner_cache[title]
        else:
            logging.debug("Wiki Handler: Querying first for " + title)
            first_params = {
                'rvlimit': 1,  # Only need one
                'rvstart': '19900101000000',  # Start listing from 1990
                'rvdir': 'newer'  # List in increasing order
            }
            earliest = self.query(req_uri, first_params, title, api_base_uri,
                                  base_uri)
            if not earliest:
                raise HandlerError("No revision returned from API.", 404)
            first = earliest[0]
            # The cache is simply emptied once it grows past its size limit.
            if len(self.inner_cache) > self.max_inner_cache_size:
                self.inner_cache = {}
            self.inner_cache[title] = first

        # This handler returns more than only the best Memento.
        # A Link with rel="first memento" will also be returned to the client.
        return [first, memento]
예제 #17
0
    def get_all_mementos(self, uri):
        MAX_TIME = 120  # seconds

        if uri.startswith('http://'):
            uri = uri.replace('http://', 'https://', 1)

        # URI deconstruction
        match = self.rex.match(uri)
        if not bool(match):
            raise HandlerError(
                "Github uri does not match a valid resource. \n" +
                ACCEPTABLE_RESOURCE, 404)
        protocol = match.groups()[0]
        base = match.groups()[1]
        user = match.groups()[2]
        repo = match.groups()[3]
        req_path = match.groups()[4]

        path = ''
        branch = ''
        # Processes one result to (memento, datetime) pair
        mapper = None

        # Defining Resource type and response handling
        # Creates one function for a specific type to map the results to
        # memento pairs.
        if base == 'github.com/':
            # Resource is a repository
            if not req_path or req_path == '/':
                if req_path:
                    path = '/'

                def make_pair(commit):
                    return (commit['html_url'].replace('commit', 'tree'),
                            commit['commit']['committer']['date'])

                mapper = make_pair

            # Resource is a file
            elif req_path.startswith('/blob/'):
                path = req_path.replace('/blob/', '', 1)
                branch_index = path.find('/')
                branch = path[:branch_index]
                path = path[branch_index:]
                if branch == '' or path == '' or path.endswith('/'):
                    raise HandlerError(
                        "Not found. Empty path for file in repository", 404)

                def make_pair(commit):
                    # HTML Resource
                    memento_path = '/blob/%s%s' % (commit['sha'], path)
                    uri_m = '%s%s%s/%s%s' % (protocol, base, user, repo,
                                             memento_path)
                    return (uri_m, commit['commit']['committer']['date'])

                mapper = make_pair

            # Resource is a directory
            elif req_path.startswith('/tree/'):
                path = req_path.replace('/tree/', '', 1)
                branch_index = path.find('/')
                if branch_index < 0:
                    branch_index = len(path)
                branch = path[:branch_index]
                path = path[branch_index:]
                if branch == '':
                    raise HandlerError("Not found. Empty branch path", 404)

                def make_pair(commit):
                    return (commit['html_url'].replace('commit', 'tree') +
                            path, commit['commit']['committer']['date'])

                mapper = make_pair

        # Resource is a raw file
        elif base == 'raw.githubusercontent.com/' and req_path is not None:
            path = req_path.replace('/', '', 1)
            branch_index = path.find('/')
            branch = path[:branch_index]
            path = path[branch_index:]
            # must be done because API does not make any difference between
            # path or files
            is_online = bool(requests.head(uri))
            if path == '' or path.endswith('/') or not is_online:
                raise HandlerError(
                    "'%s' not found: Raw resource must be a file." % path, 404)

            def make_pair(commit):
                memento_path = '/%s%s' % (commit['sha'], path)
                uri_m = '%s%s%s/%s%s' % (protocol, base, user, repo,
                                         memento_path)
                return (uri_m, commit['commit']['committer']['date'])

            mapper = make_pair

        if mapper is None:
            # The resource is not accepted.
            raise HandlerError(
                "GitHub resource type not found." + ACCEPTABLE_RESOURCE, 404)

        # Initiating request variables
        apibase = '%s/repos/%s/%s/commits' % (self.api, user, repo)
        params = {
            'per_page': 100,  # Max allowed is 100
            'path': str(path),
            'sha': str(branch)
        }
        aut_pair = ('MementoTimegate', 'LANLTimeGate14')
        cont = apibase  # The first continue is the beginning

        # Does sequential queries to get all commits of the particular resource
        queries_results = []
        tmax = int(time.time()) + MAX_TIME
        while cont is not None:
            if int(time.time()) > tmax:
                raise HandlerError(
                    "Resource too big to be served. GitHub Handler TimeOut (timeout: %d seconds)"
                    % MAX_TIME, 502)
            req = self.request(cont, params=params, auth=aut_pair)
            cont = None
            if not req:
                # status code different than 2XX
                raise HandlerError(
                    "Cannot find resource on version server. API response %d'd "
                    % req.status_code, 404)
            result = req.json()
            if 'message' in result:
                # API-specific error
                raise HandlerError(result['message'])
            if 'errors' in result:
                # API-specific error
                raise HandlerError(result['errors'])
            if len(result) > 0:
                # The request was successful
                queries_results += result
                # Search for possible continue
                if 'link' in req.headers:
                    link_header = req.headers['link']
                    headermatch = self.header_rex.search(link_header)
                    if bool(headermatch):
                        # The response was truncated, the rest can be obtained using
                        # the given "next" link
                        cont = headermatch.groups()[0]

        if queries_results:
            # Processes results based on resource type
            return map(mapper, queries_results)
        else:
            # No results found
            raise HandlerError("Resource not found, empty response from API",
                               404)
예제 #18
0
    def get_memento(self, req_url, dt):
        """Collect revision URIs of a wiki article around a given datetime.

        Four queries are sent to the wiki's ``api.php``: revisions at or
        before ``dt``, revisions after ``dt``, the last revision and the
        first revision. Every ``<rev>`` element found in the responses is
        turned into a ``(uri, timestamp)`` pair, in that query order.

        :param req_url: [str] URI of the requested article.
        :param dt: [datetime] The requested datetime.
        :return: [list((str, str))] Pairs of revision URI and the raw
        ``timestamp`` attribute reported by the API, or None when the host
        is neither under ``.wikia.com`` nor listed in ``self.hosts``.
        :raise HandlerError: (404) when the URI path holds no '/' from which
        an article title could be split off.
        """
        parsed = urlparse(req_url)
        host = parsed[1]
        upath = parsed[2]

        if '.wikia.com' not in host and host not in self.hosts:
            return None

        exploded_path = upath.rsplit('/', 1)
        if len(exploded_path) < 2:
            raise HandlerError("No article title found in requested URI.", 404)
        pref, title = exploded_path
        # Strip the '/wiki' article prefix so that only the script path is
        # left in front of index.php.
        pref = pref.replace('/wiki', '')

        default_protocol = "http://"
        dtfmstr = "%Y%m%d%H%M%S"
        dt_str = dt.strftime(dtfmstr)
        # The "newer" listing starts one second later so that a revision
        # stamped exactly at ``dt`` is not reported by both listings.
        dt_next_str = (dt + timedelta(seconds=1)).strftime(dtfmstr)

        api_prefix = (
            "%s%s/api.php?format=xml&action=query&prop=revisions"
            "&rvprop=timestamp|ids|user" % (default_protocol, host))

        urls = (
            # memento and its predecessor
            api_prefix
            + "&rvlimit=2&redirects=1&titles=%s&rvdir=older&rvstart=%s"
            % (title, dt_str),
            # next revisions
            api_prefix
            + "&rvlimit=2&redirects=1&titles=%s&rvdir=newer&rvstart=%s"
            % (title, dt_next_str),
            # last revision
            api_prefix + "&rvlimit=1&redirects=1&titles=%s" % (title,),
            # first revision
            api_prefix
            + "&rvlimit=1&redirects=1&rvdir=newer&titles=%s" % (title,),
        )

        # Every memento URI is this prefix followed by the revision id.
        base = "%s%s%s/index.php?title=%s&oldid=" % (
            default_protocol, host, pref, title)

        hdrs = {'Host': host}

        changes = []
        for url in urls:
            dom = self.get_xml(url, headers=hdrs)
            for rev in dom.xpath('//rev'):
                changes.append(
                    (base + rev.attrib['revid'], rev.attrib['timestamp']))

        return changes