def test_urlresolve(self):
    url = 'tag://0'
    hostname = gethostname()
    self.assertEquals(urlresolve(url, master=None),
                      'http://{}:8989/ddfs/tag/0'.format(hostname))
    self.assertRaises(DiscoError, lambda: urlresolve(url, master='disco-master'))
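# A minimal sketch of the tag:// mapping exercised by test_urlresolve above,
# for illustration only. The name _urlresolve_sketch and the single-scheme
# handling are assumptions; the real disco.util.urlresolve covers more url
# schemes and master configurations than the master=None case shown here.
from socket import gethostname

def _urlresolve_sketch(url):
    # 'tag://0' -> 'http://<local hostname>:8989/ddfs/tag/0' (master=None case)
    scheme, _, name = url.partition('://')
    if scheme != 'tag':
        raise ValueError("sketch only handles tag:// urls: %r" % (url,))
    return 'http://%s:8989/ddfs/tag/%s' % (gethostname(), name)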
def curl(replicas):
    for replica in replicas:
        try:
            return download(proxy_url(urlresolve(replica, master=program.ddfs.master),
                                      to_master=False))
        except Exception, e:
            sys.stderr.write("%s\n" % e)
def request(method, url, data=None, headers={}, sleep=0):
    scheme, netloc, path = urlsplit(urlresolve(url))
    try:
        conn = HTTPConnection(str(netloc))
        conn.request(method, '/{0}'.format(path), body=data, headers=headers)
        response = conn.getresponse()
        status = response.status
        errmsg = response.reason
    except httplib.HTTPException as e:
        status = None
        errmsg = str(e) or repr(e)
    except (httplib.socket.error, socket.error) as e:
        status = None
        errmsg = e if isinstance(e, basestring) else str(e) or repr(e)
    if not status or isunavailable(status):
        if sleep == 9:
            raise CommError(errmsg, url, status)
        time.sleep(random.randint(1, 2**sleep))
        return request(method, url, data=data, headers=headers, sleep=sleep + 1)
    elif isredirection(status):
        loc = response.getheader('location')
        return request(method,
                       loc if loc.startswith('http:') else resolveuri(url, loc),
                       data=data, headers=headers, sleep=sleep)
    elif not issuccessful(status):
        raise CommError(response.read(), url, status)
    return response
def request(method, url, data=None, headers={}, sleep=0):
    scheme, netloc, path = urlsplit(urlresolve(url))
    try:
        conn = HTTPConnection(str(netloc))
        conn.request(method, '/%s' % path, body=data, headers=headers)
        response = conn.getresponse()
    except (httplib.HTTPException, httplib.socket.error), e:
        raise CommError("Request failed: %s" % e, url)
def __init__(self, urls, source):
    from disco.util import urlresolve
    self.multi = pycurl.CurlMulti()
    self.pending = [(url, HTTPConnection('').prepare('PUT', urlresolve(url), body=source))
                    for url in urls]
    for url, conn in self.pending:
        self.multi.add_handle(conn.handle)
def curl(replicas):
    for replica in replicas:
        try:
            return download(proxy_url(urlresolve(replica, master=program.ddfs.master),
                                      to_master=False))
        except Exception as e:
            sys.stderr.write("{0}\n".format(e))
    if not ignore_missing:
        raise Exception("Failed downloading all replicas: {0}".format(replicas))
    return ''
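# Behaviour of curl() above, restated as a comment: replicas are tried in
# order and the first successful download is returned; a failed replica only
# logs the exception to stderr. Only after every replica has failed does the
# function raise, or return '' when ignore_missing is set in the enclosing
# command.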
def __init__(self, urls, source, token=None):
    from disco.util import urlresolve
    self.multi = pycurl.CurlMulti()
    headers = self.auth_header(token)
    self.pending = [(url, HTTPConnection('').prepare('PUT', urlresolve(url),
                                                     body=source, headers=headers))
                    for url in urls]
    for url, conn in self.pending:
        self.multi.add_handle(conn.handle)
def request(method, url, data=None, headers={}, sleep=0):
    scheme, netloc, path = urlsplit(urlresolve(url))
    try:
        conn = HTTPConnection(str(netloc))
        conn.request(method, '/%s' % path, body=data, headers=headers)
        response = conn.getresponse()
        status = response.status
        errmsg = response.reason
    except httplib.HTTPException, e:
        status = None
        errmsg = str(e) or repr(e)
def get(self, uri):
    """Returns the `Document` with the specified `uri`."""
    name, startpos, size = self.get_pos(uri)
    try:
        dump_uri = urlresolve(self.__dump_name_to_blob_uri(name))
    except KeyError:
        raise DocumentNotFound("couldn't find doc with dump name '%s'" % name)
    req = urllib2.Request(dump_uri)
    req.add_header("Range", "bytes=%d-%d" % (startpos, startpos + size - 1))
    res = urllib2.urlopen(req)
    return WARCParser(res).next()
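# The Range header above uses an inclusive byte range, so
# startpos..startpos + size - 1 fetches exactly `size` bytes. A hypothetical
# helper (name assumed, not part of the code above) that builds the same
# header value:
def _range_header(startpos, size):
    # e.g. _range_header(100, 50) == "bytes=100-149"
    return "bytes=%d-%d" % (startpos, startpos + size - 1)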
def download(url, **kwargs):
    code, body = real_download(urlresolve(url), **kwargs)
    if code == 503:
        sleep = kwargs.get('sleep', 0)
        if sleep == 9:
            raise CommError("Too many 503 replies", url)
        else:
            time.sleep(2**sleep)
            kwargs['sleep'] = sleep + 1
            return download(url, **kwargs)
    elif not str(code).startswith('2'):
        raise CommError(body, url, code)
    return body
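# Retry schedule of download() above, restated for clarity: each 503 reply
# sleeps 2**sleep seconds (1, 2, 4, ..., 256) and retries with sleep + 1,
# giving up once sleep reaches 9, i.e. after ten requests and at most
# sum(2**s for s in range(9)) == 511 seconds of total waiting.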
def test_add_dump(self):
    self.docset.add_dump("d1", dump1)
    # check that it's in list of dumps
    self.assertTrue("d1" in self.docset.dump_names())
    # check accessible over http
    from disco.util import urlresolve
    import urllib2
    uri = list(self.docset.dump_uris())[0]
    httpuri = urlresolve(uri)
    d = urllib2.urlopen(httpuri).read()
    self.assertEquals(d, fixtures.warc_file1)
def index(self):
    # Lazily load index data from DDFS.
    if self.__index is None:
        blobs = [uri for (uri,) in self.ddfs.blobs(self.ddfs_index_tag)]
        if len(blobs) == 0:
            self.__index = {}
            self.__index_version = 0
        else:
            # Find blob with highest version number.
            ver, discouri = sorted([(self.__blob_uri_to_dump_name(uri), uri)
                                    for uri in blobs], reverse=True)[0]
            uri = urlresolve(discouri)
            data = urllib2.urlopen(uri).read()
            try:
                self.__index = pickle.loads(data)
                self.__index_version = int(ver)
            except EOFError:
                raise EOFError("EOF reading docset index at %s in tag %s"
                               % (uri, self.ddfs_index_tag))
    return self.__index
def xcat(program, *urls):
    """Usage: [urls ...]

    Concatenate the extracted results stored in url[s] and print to stdout.
    If any of the url[s] are tags,
    the blobs reachable from the tags will be printed after any non-tag url[s].
    """
    from itertools import chain
    from disco.core import classic_iterator
    from disco.util import iterify, reify, urlresolve, proxy_url
    tags, urls = program.separate_tags(*program.input(*urls))
    stream = reify(program.options.stream)
    reader = program.options.reader
    reader = reify('disco.func.chain_reader' if reader is None else reader)
    bloburls = [[proxy_url(urlresolve(u), to_master=False) for u in repset]
                for repset in chain(urls, program.blobs(*tags))]
    for record in classic_iterator(bloburls, input_stream=stream, reader=reader):
        print '\t'.join('%s' % (e, ) for e in iterify(record)).rstrip()
def xcat(program, *urls):
    """Usage: [urls ...]

    Concatenate the extracted results stored in url[s] and print to stdout.
    If any of the url[s] are tags,
    the blobs reachable from the tags will be printed after any non-tag url[s].
    """
    from itertools import chain
    from disco.core import classic_iterator
    from disco.util import iterify, reify, urlresolve, proxy_url
    tags, urls = program.separate_tags(*program.input(*urls))
    stream = reify(program.options.stream)
    reader = program.options.reader
    reader = reify('disco.func.chain_reader' if reader is None else reader)
    bloburls = [[proxy_url(urlresolve(u), to_master=False) for u in repset]
                for repset in chain(urls, program.blobs(*tags))]
    for record in classic_iterator(bloburls, input_stream=stream, reader=reader):
        print '\t'.join('%s' % (e,) for e in iterify(record)).rstrip()
def request(method, url, data=None, headers={}, sleep=0):
    scheme, netloc, path = urlsplit(urlresolve(url))
    # This fixes a problem with Unicode errors in Python 2.7
    # works in Python 2.6 as well, but not earlier versions
    try:
        if data is not None:
            data = bytearray(data)
    except NameError:
        # In Python < 2.6, bytearray doesn't exist
        pass
    try:
        conn = HTTPConnection(str(netloc))
        conn.request(method, '/%s' % path, body=data, headers=headers)
        response = conn.getresponse()
        status = response.status
        errmsg = response.reason
    except httplib.HTTPException, e:
        status = None
        errmsg = str(e) or repr(e)
def request(method, url, data=None, headers={}, sleep=0):
    scheme, netloc, path = urlsplit(urlresolve(url))
    try:
        conn = HTTPConnection(str(netloc))
        conn.request(method, '/{0}'.format(path), body=data, headers=headers)
        response = conn.getresponse()
        status = response.status
        errmsg = response.reason
    except httplib.HTTPException as e:
        status = None
        errmsg = str(e) or repr(e)
    except (httplib.socket.error, socket.error) as e:
        status = None
        errmsg = e if isinstance(e, basestring) else str(e) or repr(e)
    if not status or isunavailable(status):
        if sleep == 9:
            raise CommError(errmsg, url, status)
        time.sleep(random.randint(1, 2**sleep))
        return request(method, url, data=data, headers=headers, sleep=sleep + 1)
    elif isredirection(status):
        loc = response.getheader('location')
        return request(method,
                       loc if loc.startswith('http:') else resolveuri(url, loc),
                       data=data,
                       headers=headers,
                       sleep=sleep)
    elif not issuccessful(status):
        raise CommError(response.read(), url, status)
    return response
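# Hypothetical usage of the retrying request() above; the master host:port
# and tag name are illustrative placeholders, not values taken from this
# code. On connection errors or "unavailable" statuses the function waits a
# random 1..2**sleep seconds and retries with sleep + 1, raising CommError
# once sleep reaches 9; redirects are followed with the same sleep counter.
#
#     response = request('GET', 'http://master.example:8989/ddfs/tag/data:sample')
#     print response.read()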
def open_remote(url, token=None):
    conn = Connection(urlresolve(url), token)
    return conn, len(conn), conn.url
def open_remote(url, token=None):
    return Connection(urlresolve(url), token)
def open_remote(url):
    conn = Conn(urlresolve(url))
    return conn, conn.length(), conn.url
def indexurl(self, indexspec):
    resource = urlparse.urlparse(indexspec)
    if resource.netloc:
        return urlresolve(indexspec)
    path = '/indices/%s' % indexspec
    return urlparse.urlunparse(('http', self.netloc, path, '', '', ''))
def _resolve(self, url):
    return urlresolve(url, settings=self.settings)
def _resolve(self, url):
    return urlresolve(url, master=self.master)
def indexurl(self, indexspec): resource = urlparse.urlparse(indexspec) if resource.netloc: return urlresolve(indexspec) path = "/indices/%s" % indexspec return urlparse.urlunparse(("http", self.netloc, path, "", "", ""))
def open_remote(url):
    conn = Connection(urlresolve(url))
    return conn, len(conn), conn.url