Example #1
    def __init__(self, *args, **kwargs):
        # load the defaults
        super(Settings, self).update(defaults)

        # override with the settings file
        path = kwargs.get('settings_file') or self['settings_file']
        if path and os.path.exists(path):
            try:
                import yaml
                self.update(yaml.load(open(path)))
            except:
                pass  # if ya can't ya can't

        # final overrides
        super(Settings, self).update(overrides)
        super(Settings, self).__init__(*args, **kwargs)

        # set up ddfs and disco
        if not self['server'].startswith('disco://'):
            self['server'] = 'disco://' + self['server']

        if 'ddfs' not in self:
            self['ddfs'] = DDFS(self['server'])
        self['server'] = Disco(self['server'])

        # set up worker
        if 'worker' not in self:
            worker_mod, _, worker_class = self['worker_class'].rpartition('.')
            mod = __import__(worker_mod, {}, {}, worker_mod)
            self['worker'] = getattr(mod, worker_class)()
Example #2
File: util.py Project: yuj/disco
def load_oob(host, name, key):
    from disco.ddfs import DDFS
    ddfs = DDFS(host)
    # NB: this assumes that blobs are listed in LIFO order.
    # We want to return the latest version
    for fd in ddfs.pull(ddfs.job_oob(name), blobfilter=lambda x: x == key):
        return fd.read()
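A minimal usage sketch for the helper above; the master URL, job name, and key are placeholders, not values from the source:

# Hypothetical call: read one out-of-band value written by a finished job.
value = load_oob('disco://localhost', 'myjob', 'mykey')
print(value)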
Example #3
def delete_all():
    '''
    Deletes all tags in DDFS, thus orphaning all blobs and making them
    subject to eventual removal by the garbage collector.
    '''
    ddfs = DDFS()
    for tag in ddfs.list():
        ddfs.delete(tag)
Example #4
 def setUp(self):
     tag = 'disco:test:authjob'
     self.ddfs = DDFS(self.disco_master_url)
     pushed = self.ddfs.push(tag, [(StringIO('blobdata'), 'blob')])
     self.ddfs.setattr(tag, 'ddfs:read-token', 'r')
     self.input = ['tag://*****:*****@/' + tag]
     super(AuthJobTestCase, self).setUp()
Example #5
 def setUp(self):
     self.ddfs = DDFS(self.disco_master_url)
     self.ddfs.push('disco:test:authrd', [(StringIO('datablob'), 'blobdata')])
     self.ddfs.push('disco:test:authwr', [(StringIO('datablob'), 'blobdata')])
     self.ddfs.setattr('disco:test:authrd', 'a', 'v')
     self.ddfs.setattr('disco:test:authwr', 'a', 'v')
     self.ddfs.setattr('disco:test:authrd', 'ddfs:read-token', 'rdr')
     self.ddfs.setattr('disco:test:authwr', 'ddfs:write-token', 'wtr')
Example #6
 def setUp(self):
     self.ddfs = DDFS(self.disco_master_url)
     self.ddfs.push('disco:test:blobs', [(StringIO('datablob'), 'blobdata')])
     self.ddfs.push('disco:test:blobs', [(StringIO('datablob2'), 'blobdata2')])
     self.ddfs.push('disco:test:emptyblob', [(StringIO(''), 'empty')])
     self.ddfs.tag('disco:test:tag', [['urls']])
     self.ddfs.tag('disco:test:metatag',
                   [['tag://disco:test:tag'], ['tag://disco:test:metatag']])
Example #7
def ddfs_save(blobs, name, master):
    from disco.ddfs import DDFS
    ddfs = DDFS(master)
    blobs = [(blob, ('discoblob:%s:%s' % (name, os.path.basename(blob))))
             for blob in blobs]
    tag = ddfs_name(name)
    ddfs.push(tag, blobs, retries=600, delayed=True, update=True)
    return "tag://%s" % tag
Example #8
def get_disco_handle(server):
    from disco.core import Disco
    from disco.ddfs import DDFS

    if server and not server.startswith('disco://'):
        server = 'disco://' + server

    return Disco(server), DDFS(server)
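A minimal usage sketch, assuming a Disco master reachable as 'localhost' (the hostname is illustrative):

# Hypothetical usage: get a job client and a DDFS client from one host name.
disco, ddfs = get_disco_handle('localhost')
print(ddfs.list())  # list all DDFS tags on that master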
Example #9
def main(file_in="iris.csv", file_out="centers.csv", n_clusters=3):
    # TODO: Rename tag data:kcluster1 if tag exists.
    # Disco v0.4.4 requires that ./ prefix the file to identify it as a local file.
    # http://disco.readthedocs.org/en/0.4.4/howto/chunk.html#chunking
    tag = "data:sort"
    DDFS().chunk(tag=tag, urls=['./'+file_in])
    try:
        # Import since slave nodes do not have same namespace as master
        from kcluster_map_reduce import KCluster
        job = KCluster().run(input=[tag], map_reader=chain_reader)
        with open(file_out, 'w') as f_out:
            writer = csv.writer(f_out, quoting=csv.QUOTE_NONNUMERIC)
            for center in result_iterator(job.wait(show=True)):
                writer.writerow([center])
    finally:
        DDFS().delete(tag=tag)
    return None
Example #10
    def save_outputs(self, jobname, master=None):
        from disco.ddfs import DDFS

        def paths():
            for output in self.outputs.values():
                output.file.close()
                yield output.path

        self.send('OUTPUT', [DDFS(master).save(jobname, paths()), 'tag'])
Example #11
def push_by_tag(file_paths, tag=None):
    '''
    Push files into DDFS. If no tag is given, each file is pushed under a
    tag derived from its base name (without extension); otherwise all files
    are pushed under the given tag.
    '''
    ddfs = DDFS()
    if tag is None:
        for file_path in file_paths:
            tag = file_path.split("/")[-1].split(".")[0]
            ddfs.push(tag, [file_path])
    else:
        ddfs.push(tag, file_paths)
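A hedged usage sketch; the file paths and tag are placeholders. With tag=None each file is pushed under a tag derived from its base name, otherwise every file goes under the single tag given:

# Hypothetical usage: 'a.csv' and 'b.csv' end up under tags 'a' and 'b'.
push_by_tag(['/data/a.csv', '/data/b.csv'])
# Hypothetical usage: both files are pushed under one explicit tag.
push_by_tag(['/data/a.csv', '/data/b.csv'], tag='data:ab')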
Example #12
def inputexpand(input, label=None, settings=DiscoSettings()):
    from disco.ddfs import DDFS, istag
    if ispartitioned(input) and label is not False:
        return zip(*(parse_dir(i, label=label) for i in iterify(input)))
    if isiterable(input):
        return [inputlist(input, label=label, settings=settings)]
    if istag(input):
        ddfs = DDFS(settings=settings)
        return chainify(blobs for name, tags, blobs in ddfs.findtags(input))
    return [input]
Example #13
def load(file_in, tag):
    """
    Load file into Disco.
    """

    # If Disco tag exists, delete it.
    # Don't add all-new data to an already existing tag.
    if DDFS().exists(tag=tag):
        print("WARNING: Overwriting Disco tag {tag}.".format(tag=tag),
              file=sys.stderr)
        DDFS().delete(tag=tag)

    # Load data into Disco Distributed File System.
    print("Loading into Disco:\n{file_in}\nunder tag\n{tag}".format(
        file_in=file_in, tag=tag))
    try:
        DDFS().chunk(tag=tag, urls=[os.path.join('./', file_in)])
    except ValueError as err:
        print("ValueError: " + err.message, file=sys.stderr)
        print("File: {file_in}".format(file_in=file_in), file=sys.stderr)

    return None
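A hedged usage sketch for load; the file name and tag below are placeholders:

# Hypothetical usage: chunk a local CSV into DDFS, replacing the tag if it exists.
load('iris.csv', 'data:iris')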
Example #14
def urllist(url, partid=None, listdirs=True, ddfs=None):
    from disco.ddfs import DDFS, istag
    if istag(url):
        token = auth_token(url)
        ret = []
        for name, tags, blobs in DDFS(ddfs).findtags(url, token=token):
            ret += blobs
        return ret
    if isiterable(url):
        return [list(url)]
    scheme, netloc, path = urlsplit(url)
    if scheme == 'dir' and listdirs:
        return parse_dir(url, partid=partid)
    return [url]
Example #15
def main_load(args):
    """
    Stage of main function for loading individual files:
    - Download bz2 file if it doesn't exist.
    - Decompress and partition bz2 file if it doesn't exist.
    - Load data into Disco Distributed File System if it doesn't exist.
    """
    df_bz2urls_filetags = args.df_concat.dropna(subset=['bz2url', 'filetag'])
    # Download bz2 file if it doesn't exist.
    # TODO: parallelize, see "programming python" on threads
    # quick hack: use Popen with wget to download
    for (idx, bz2url, filetag) in df_bz2urls_filetags[['bz2url', 'filetag']].itertuples():
        fbz2 = os.path.join(args.data_dir, os.path.basename(bz2url))
        if os.path.isfile(fbz2):
            if args.verbose >= 2:
                print(("INFO: Skipping download. File already exists:\n {fbz2}"
                       ).format(fbz2=fbz2))
        else:
            if args.verbose >= 1:
                print(("INFO: Downloading:\n {url}\n to:\n {fout}").format(
                    url=bz2url, fout=fbz2))

            try:
                download(url=bz2url, fout=fbz2)
            except:
                ErrMsg().eprint(err=sys.exc_info())
    # Decompress and partition bz2 file if it doesn't exist.
    # TODO: parallelize, see "programing python" on threads
    # quick hack: use Popen with "bunzip2 --keep" and "grep -oE '.{1,1000}' fname" to partition
    for (idx, bz2url, filetag) in df_bz2urls_filetags[['bz2url', 'filetag']].itertuples():
        fbz2 = os.path.join(args.data_dir, os.path.basename(bz2url))
        fdecom = os.path.splitext(fbz2)[0]
        if os.path.isfile(fdecom):
            if args.verbose >= 2:
                print(
                    ("INFO: Skipping decompress and partition." +
                     " File already exists:\n {fdecom}").format(fdecom=fdecom))
        else:
            if args.verbose >= 1:
                print(("INFO: Decompressing and partitioning:\n" +
                       " {fbz2}\n to:\n {fout}").format(fbz2=fbz2,
                                                        fout=fdecom))
            try:
                decom_part(fbz2=fbz2, fout=fdecom)
            except:
                ErrMsg().eprint(err=sys.exc_info())
    # Load data into Disco Distributed File System if it doesn't exist.
    cmds = []
    for (idx, bz2url, filetag) in df_bz2urls_filetags[['bz2url', 'filetag']].itertuples():
        fbz2 = os.path.join(args.data_dir, os.path.basename(bz2url))
        fdecom = os.path.splitext(fbz2)[0]
        if DDFS().exists(tag=filetag):
            if args.verbose >= 2:
                print(("INFO: Skipping Disco upload." +
                       " Tag already exists:\n {tag}.").format(tag=filetag))
        else:
            if args.verbose >= 1:
                print(("INFO: Loading into Disco:\n" +
                       " {fdecom}\n under tag:\n {tag}").format(fdecom=fdecom,
                                                                tag=filetag))
            cmd = ("ddfs chunk {tag} {url}").format(tag=filetag,
                                                    url=os.path.join(
                                                        './', fdecom))
            cmds.append(cmd)
            # TODO: parallelize using Python API rather than system, see "programming python" on threads
            # try: DDFS().chunk(tag=filetag, urls=[os.path.join('./', fdecom)])
    try:
        processes = [Popen(cmd, shell=True) for cmd in cmds]
        for proc in processes:
            proc.wait()
    except:
        ErrMsg().eprint(err=sys.exc_info())
    return None
Example #16
File: cli.py Project: tpeng/disco
 def ddfs(self):
     from disco.ddfs import DDFS
     return DDFS(settings=self.settings)
Example #17
    else:
        # print url, rest
        fle = util.localize(rest,
                            disco_data=worker.Task.disco_data,
                            ddfs_data=worker.Task.ddfs_data)

        yield url, fle


def copy_tags_map((url, local_file), params):
    from disco.ddfs import DDFS
    from disco.comm import request
    from tempfile import NamedTemporaryFile
    from socket import gethostname
    try:
        ddfs = DDFS(params.target_disco_master)
        if params.chunk:
            ddfs.chunk(params.target_tag, [local_file])
        else:
            ddfs.push(params.target_tag, [local_file])
        print "pushed local: %s" % local_file
    except Exception as e:
        # we couldn't push the local file for whatever reason, let's try downloading the URL, then pushing
        try:
            blob_req = request('GET', url)
            with NamedTemporaryFile("w", delete=True) as fd:
                fd.write(blob_req.read())
                fd.flush()
                ddfs = DDFS(params.target_disco_master)
                if params.chunk:
                    ddfs.chunk(params.target_tag, [fd.name])
Example #18
def save_oob(host, name, key, value, ddfs_token=None):
    from disco.ddfs import DDFS
    DDFS(host).push(ddfs_oobname(name), [(StringIO(value), key)], delayed=True)
Example #19
 def ddfs(self):
     from disco.ddfs import DDFS
     return DDFS(master=self.settings['DISCO_MASTER'])
Example #20
def writetoken(program, tag, tok):
    """Usage: [-t token] tag token

    Set the write token of a tag.
    """
    program.ddfs.setattr(tag, 'ddfs:write-token', tok, token=program.options.token)

@DDFS.command
def xcat(program, *urls):
    """Usage: [-i] [-p] [-R reader] [-t token] [urls ...]

    Concatenate the extracted results stored in url[s] and print to stdout.
    If any of the url[s] are tags,
    the blobs reachable from the tags will be printed after any non-tag url[s].
    """
    from disco.core import RecordIter
    from disco.util import iterify, reify

    tags, urls = program.separate_tags(*urls)
    stream = reify(program.options.stream)
    reader = program.options.reader
    reader = reify('disco.func.chain_reader' if reader is None else reader)

    for record in RecordIter(chain(urls, program.blobs(*tags)),
                             input_stream=stream,
                             reader=reader):
        print '\t'.join('%s' % (e,) for e in iterify(record)).rstrip()

if __name__ == '__main__':
    DDFS(option_parser=DDFSOptionParser()).main()
Example #21
 def setUp(self):
     self.ddfs = DDFS(self.disco_master_url)
     self.ddfs.push('disco:test:attrs', [(StringIO('datablob'), 'blobdata')])
     self.ddfs.setattr('disco:test:attrs', 'a1', 'v1')
     self.ddfs.setattr('disco:test:attrs', 'a2', 'v2')
Example #22
 def setUp(self):
     self.ddfs = DDFS(self.disco_master_url)
Example #23
from discodex import settings
from discodex.mapreduce import (Indexer, DiscoDBIterator)
from discodex.objects import (DataSet, IChunks, Indices, Index, Results, Dict)

from disco.core import Disco
from disco.ddfs import DDFS
from disco.error import DiscoError
from disco.util import flatten, parse_dir

discodex_settings = settings.DiscodexSettings()
disco_master_url = discodex_settings['DISCODEX_DISCO_MASTER']
disco_prefix = discodex_settings['DISCODEX_DISCO_PREFIX']
index_prefix = discodex_settings['DISCODEX_INDEX_PREFIX']
purge_file = discodex_settings['DISCODEX_PURGE_FILE']
disco_master = Disco(disco_master_url)
ddfs = DDFS(disco_master_url)

NOT_FOUND, OK, ACTIVE, DEAD = 'unknown job', 'ready', 'active', 'dead'


class IndexCollection(Collection):
    allowed_methods = ('GET', 'POST')

    def delegate(self, request, *args, **kwargs):
        name = str(kwargs.pop('name'))
        return IndexResource(name)(request, *args, **kwargs)

    @property
    def names(self):
        return ddfs.list(index_prefix)
Example #24
 def ddfs(self):
     return DDFS(settings=self.settings)
Example #25
 def setUp(self):
     self.d = DDFS()
     wait_for_gc_to_finish(self.d)
     with open(FILE, 'w') as f:
         print >> f, "hello world!"
Example #26
def main_sets(args):
    """
    Stage of main function for packing individual files into data sets.
    - Sort filetags by size in descending order.
    - Add filetags to a dataset as long as they can fit.
    - Label the dataset with the actual dataset size.
    - Append data to settag from filetags in DDFS.
    - Note: Must have all 'filetag' loaded.
    """
    df_bz2urls_filetags = args.df_concat.dropna(subset=['bz2url', 'filetag'])
    bytes_per_gb = 10**9
    filetag_sizegb_map = {}
    # If it exists, use checked, downloaded data from Disco to verify dataset sizes,
    # otherwise use decompressed files prior to Disco upload.
    if args.check_filetags:
        # idx variables are unused.
        for (idx, bz2url, filetag) in df_bz2urls_filetags[['bz2url', 'filetag']].itertuples():
            ftag = os.path.join(args.data_dir, filetag + '.txt')
            ftag_sizegb = os.path.getsize(ftag) / bytes_per_gb
            filetag_sizegb_map[filetag] = ftag_sizegb
    else:
        # idx variables are unused.
        for (idx, bz2url, filetag) in df_bz2urls_filetags[['bz2url', 'filetag']].itertuples():
            fbz2 = os.path.join(args.data_dir, os.path.basename(bz2url))
            fdecom = os.path.splitext(fbz2)[0]
            fdecom_sizegb = os.path.getsize(fdecom) / bytes_per_gb
            filetag_sizegb_map[filetag] = fdecom_sizegb
    # Sort filetags by size in descending order.
    # Add filetags to a dataset as long as they can fit. Nest the data sets.
    filetag_sizegb_sorted = sorted(filetag_sizegb_map.iteritems(),
                                   key=operator.itemgetter(1),
                                   reverse=True)
    settag_filetags_map = {}
    is_first = True
    for size in sorted(args.sets_gb):
        filetags = []
        tot = 0.
        res = size
        # Include smaller data sets in the next larger dataset.
        if not is_first:
            filetags.extend(settag_filetags_map[prev_settag])
            tot += prev_tot
            res -= prev_tot
        for (filetag, sizegb) in filetag_sizegb_sorted:
            if (sizegb <= res) and (filetag not in filetags):
                filetags.append(filetag)
                tot += sizegb
                res -= sizegb
        # Label the dataset with the actual dataset size.
        # Note: Disco tags must have character class [A-Za-z0-9_\-@:]+ else get CommError.
        settag = ("{tot:.2f}GB".format(tot=tot)).replace('.', '-')
        settag_filetags_map[settag] = filetags
        # Include the smaller data set in the next larger dataset.
        prev_tot = tot
        prev_settag = settag
        is_first = False
    # Append data to settag from filetags in DDFS.
    # TODO: use logging.
    for settag in sorted(settag_filetags_map):
        if DDFS().exists(tag=settag):
            if args.verbose >= 2:
                print(("INFO: Skipping Disco upload." +
                       " Tag already exists:\n {tag}.").format(tag=settag))
        else:
            if args.verbose >= 1:
                print(
                    ("INFO: Appending data to settag from filetags:\n" +
                     " {settag}\n" + " {filetags}").format(
                         settag=settag, filetags=settag_filetags_map[settag]))
            for filetag in settag_filetags_map[settag]:
                try:
                    filetag_urls = DDFS().urls(filetag)
                    DDFS().tag(settag, filetag_urls)
                except:
                    ErrMsg().eprint(err=sys.exc_info())
    return None
Example #27
File: util.py Project: yuj/disco
def save_oob(host, name, key, value, ddfs_token=None):
    from disco.ddfs import DDFS
    DDFS(host).push(DDFS.job_oob(name), [(BytesIO(value), key)], delayed=True)
Example #28
 def _map_input_stream(fd, size, url, params):
     from disco.ddfs import DDFS
     tag = params or 'disco:chunks:%s' % Task.jobname
     yield url, DDFS(Task.master).chunk(tag, [url])
Example #29
 def ddfs(self):
     from disco.ddfs import DDFS
     return DDFS(self.master)
Example #30
 def _map_input_stream(fd, size, url, params):
     from disco.ddfs import DDFS
     from disco.func import gzip_line_reader
     tag = params or 'disco:chunks:%s' % Task.jobname
     yield url, DDFS(Task.master).chunk(tag, [url], reader=gzip_line_reader)