def classic_iterator(urls,
                     reader=task_io.chain_reader,
                     input_stream=(func.map_input_stream, ),
                     notifier=func.notifier,
                     params=None,
                     ddfs=None):
    """
    An iterator over records as seen by the classic map interface.

    :type reader: :func:`disco.worker.task_io.input_stream`
    :param reader: shortcut for the last input stream applied.

    :type input_stream: sequence of :func:`disco.worker.task_io.input_stream`
    :param input_stream: used to read from a custom file format.

    :type notifier: :func:`disco.func.notifier`
    :param notifier: called when the task opens a url.
    """
    from disco.worker import Input
    from disco.worker.classic.worker import Worker
    worker = Worker(map_reader=reader, map_input_stream=input_stream)
    settings = DiscoSettings(DISCO_MASTER=ddfs) if ddfs else DiscoSettings()
    for input in util.inputlist(urls, settings=settings):
        if isinstance(input, basestring):
            dest = proxy_url(input, to_master=False)
        elif isinstance(input, tuple):
            dest = tuple(proxy_url(i, to_master=False) for i in input)
        else:
            dest = [proxy_url(i, to_master=False) for i in input]
        notifier(dest)
        for record in Input(dest, open=worker.opener('map', 'in', params)):
            yield record
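# A minimal, hedged usage sketch for classic_iterator: `results` is a
# hypothetical list of result urls (e.g. the value returned by a classic
# job's wait()); with the default chain_reader, each record is whatever
# the tasks wrote, typically a (key, value) pair.
from disco.core import classic_iterator

for record in classic_iterator(results):
    print record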
def _download(self, url, data=None, token=None, method='GET'):
    return json.loads(download(self._resolve(proxy_url(url,
                                                       proxy=self.proxy,
                                                       meth=method)),
                               data=data,
                               method=method,
                               token=self._token(url, token, method)))
def _upload(self, urls, source, token=None, to_master=True, **kwargs):
    urls = list(iterify(urls))
    put_urls = [self._resolve(proxy_url(url,
                                        proxy=self.proxy,
                                        meth='PUT',
                                        to_master=to_master))
                for url in urls]
    # Derive the token from the last input url explicitly; the original
    # relied on the comprehension variable `url` leaking out of the list
    # comprehension, which only works in Python 2.
    return upload(put_urls, source,
                  token=self._token(urls[-1], token, 'PUT'),
                  **kwargs)
def curl(replicas):
    for replica in replicas:
        try:
            return download(proxy_url(urlresolve(replica,
                                                 master=program.ddfs.master),
                                      to_master=False))
        except Exception, e:
            sys.stderr.write("%s\n" % e)
def result_iterator(results, notifier=None,
                    proxy=None, reader=func.netstr_reader):
    res = []
    for dir_url in results:
        if dir_url.startswith("dir://"):
            res += util.parse_dir(dir_url, proxy)
        else:
            res.append(dir_url)
    _, _, root = util.load_conf()
    for url in res:
        if url.startswith("file://"):
            fname = url[7:]
            fd = file(fname)
            sze = os.stat(fname).st_size
        elif url.startswith("disco://"):
            host, fname = url[8:].split("/", 1)
            url = util.proxy_url(proxy, fname, host)
            if util.resultfs_enabled:
                f = "%s/data/%s" % (root, fname)
                fd = file(f)
                sze = os.stat(f).st_size
            else:
                sze, fd = comm.open_remote(url)
        else:
            raise JobException("Invalid result url: %s" % url)
        if notifier:
            notifier(url)
        for x in reader(fd, sze, fname):
            yield x
def _download(self, url, data=None, token=None, method="GET", to_master=True):
    byts = download(
        self._resolve(proxy_url(url,
                                proxy=self.proxy,
                                meth=method,
                                to_master=to_master)),
        data=data,
        method=method,
        token=self._token(url, token, method),
    )
    return json.loads(bytes_to_str(byts))
def _download(self, url, data=None, token=None, method='GET', to_master=True):
    return json.loads(download(self._resolve(proxy_url(url,
                                                       proxy=self.proxy,
                                                       meth=method,
                                                       to_master=to_master)),
                               data=data,
                               method=method,
                               token=self._token(url, token, method)))
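# Hedged sketch of a typical caller: DDFS helpers wrap _download around the
# master's JSON endpoints. The '/ddfs/tags/' path follows the DDFS HTTP API,
# but the exact method name and wiring here are assumptions.
def list(self, prefix=''):
    return self._download('{0}/ddfs/tags/{1}'.format(self.master, prefix))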
def request(self, url, data=None, offset=0):
    try:
        return download(proxy_url('%s%s' % (self.master, url),
                                  proxy=self.proxy),
                        data=data,
                        offset=offset)
    except CommError, e:
        if e.code is None:
            e.msg += " (is disco master running at %s?)" % self.master
        raise
def curl(replicas):
    for replica in replicas:
        try:
            return download(proxy_url(urlresolve(replica,
                                                 master=program.ddfs.master),
                                      to_master=False))
        except Exception as e:
            sys.stderr.write("{0}\n".format(e))
    if not ignore_missing:
        raise Exception("Failed downloading all replicas: {0}".format(replicas))
    return ''
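# Hedged usage sketch: curl() returns the payload of the first replica that
# downloads successfully. The replica urls below are hypothetical.
data = curl(['disco://node1/ddfs/vol0/blob/ab/example',
             'disco://node2/ddfs/vol0/blob/ab/example'])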
def request(self, url, data=None, offset=0):
    try:
        byts = download(proxy_url('{0}{1}'.format(self.master, url),
                                  proxy=self.proxy),
                        data=data,
                        offset=offset)
        return byts.decode('utf-8')
    except CommError as e:
        if e.code is None:
            e.msg += " (is disco master running at {0}?)".format(self.master)
        raise
def pull(self, tag, blobfilter=lambda x: True, token=None):
    for repl in self.urls(tag, token=token):
        if blobfilter(self.blob_name(repl[0])):
            random.shuffle(repl)
            for url in repl:
                try:
                    yield open_remote(proxy_url(url, proxy=self.proxy))
                    break
                except CommError, error:
                    continue
            else:
                # The for-else runs only if every replica failed; `error`
                # stays bound after the except block in Python 2.
                raise error
def result_iterator(urls,
                    reader=task_io.chain_reader,
                    input_stream=(func.map_input_stream, ),
                    notifier=func.notifier,
                    params=None,
                    ddfs=None):
    """
    An iterator over records stored in either disco or ddfs.

    :type reader: :func:`disco.worker.task_io.input_stream`
    :param reader: shortcut for the last input stream applied.

    :type input_stream: sequence of :func:`disco.worker.task_io.input_stream`
    :param input_stream: used to read from a custom file format.

    :type notifier: :func:`disco.func.notifier`
    :param notifier: called when the task opens a url.
    """
    from disco.worker import Input
    from disco.worker.task_io import StreamCombiner
    settings = DiscoSettings(DISCO_MASTER=ddfs) if ddfs else DiscoSettings()
    for input in util.inputlist(urls, settings=settings):
        if isinstance(input, basestring):
            dest = proxy_url(input, to_master=False)
        elif isinstance(input, tuple):
            dest = tuple(proxy_url(i, to_master=False) for i in input)
        else:
            dest = [proxy_url(i, to_master=False) for i in input]
        notifier(dest)

        def open(url):
            streams = list(input_stream)
            if reader:
                streams.append(reader)
            return StreamCombiner(url, streams, params)

        for record in Input(dest, open=open):
            yield record
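# Hedged usage sketch, following the standard Disco word-count pattern:
# `job` is assumed to be a disco.job.Job that has already been run.
from disco.core import result_iterator

for word, count in result_iterator(job.wait()):
    print('{0} {1}'.format(word, count))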
def request(self, url, data=None, offset=0, as_bytes=False):
    try:
        byts = download(proxy_url('{0}{1}'.format(self.master, url),
                                  proxy=self.proxy),
                        data=data,
                        offset=offset)
        if as_bytes:
            return byts
        return byts.decode('utf-8')
    except CommError as e:
        if e.code is None:
            e.msg += " (is disco master running at {0}?)".format(self.master)
        raise
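# Hedged sketch of a typical caller: client methods pass master-relative
# paths to request(). The '/disco/ctrl/joblist' endpoint is assumed from
# the master's control API.
def joblist(self):
    return self.request('/disco/ctrl/joblist')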
def pull(self, tag, blobfilter=lambda x: True, token=None):
    for repl in self.urls(tag, token=token):
        if blobfilter(self.blob_name(repl[0])):
            random.shuffle(repl)
            for url in repl:
                url = self._resolve(proxy_url(url,
                                              meth='GET',
                                              proxy=self.proxy,
                                              to_master=False))
                try:
                    yield open_remote(url)
                    break
                except CommError, error:
                    continue
            else:
                # Runs only if every replica failed; relies on the Python 2
                # behavior that `error` stays bound after the except block.
                raise error
def pull(self, tag, blobfilter=lambda x: True, token=None):
    """
    Iterate over the blobs in a ``tag``,
    after optionally applying a ``blobfilter`` over the blob names.
    """
    comm_error = None
    for repl in self.urls(tag, token=token):
        if blobfilter(self.blob_name(repl[0])):
            random.shuffle(repl)
            for url in repl:
                url = self._resolve(proxy_url(url,
                                              meth="GET",
                                              proxy=self.proxy,
                                              to_master=False))
                try:
                    yield open_remote(url)
                    break
                except CommError as error:
                    comm_error = error
                    continue
            else:
                raise comm_error
def pull(self, tag, blobfilter=lambda x: True, token=None):
    comm_error = None
    for repl in self.urls(tag, token=token):
        if blobfilter(self.blob_name(repl[0])):
            random.shuffle(repl)
            for url in repl:
                url = self._resolve(proxy_url(url,
                                              meth='GET',
                                              proxy=self.proxy,
                                              to_master=False))
                try:
                    yield open_remote(url)
                    break
                except CommError as error:
                    comm_error = error
                    continue
            else:
                raise comm_error
def xcat(program, *urls):
    """Usage: [urls ...]

    Concatenate the extracted results stored in url[s] and print to stdout.
    If any of the url[s] are tags,
    the blobs reachable from the tags will be printed after any non-tag url[s].
    """
    from itertools import chain
    from disco.core import classic_iterator
    from disco.util import iterify, reify, urlresolve, proxy_url
    tags, urls = program.separate_tags(*program.input(*urls))
    stream = reify(program.options.stream)
    reader = program.options.reader
    reader = reify('disco.func.chain_reader' if reader is None else reader)
    bloburls = [[proxy_url(urlresolve(u), to_master=False) for u in repset]
                for repset in chain(urls, program.blobs(*tags))]
    for record in classic_iterator(bloburls, input_stream=stream, reader=reader):
        print '\t'.join('%s' % (e,) for e in iterify(record)).rstrip()
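# Hedged usage note: xcat is exposed through the ddfs command-line tool,
# e.g. `ddfs xcat data:mytag` (a hypothetical tag name) dumps all records
# reachable from the tag to stdout.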
def pull(self, tag, blobfilter=lambda x: True, token=None):
    """
    Iterate over the blobs in a ``tag``,
    after optionally applying a ``blobfilter`` over the blob names.
    """
    comm_error = None
    for repl in self.urls(tag, token=token):
        if blobfilter(self.blob_name(repl[0])):
            random.shuffle(repl)
            for url in repl:
                url = self._resolve(proxy_url(url,
                                              meth='GET',
                                              proxy=self.proxy,
                                              to_master=False))
                try:
                    yield open_remote(url)
                    break
                except CommError as error:
                    comm_error = error
                    continue
            else:
                raise comm_error
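# Hedged usage sketch for pull(): open_remote yields file-like objects, so
# each blob can be read directly. 'data:mytag' is a hypothetical tag name.
from disco.ddfs import DDFS

for blob in DDFS().pull('data:mytag'):
    print(blob.read())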