Пример #1
0
def delete_all():
    """
    Delete every tag that DDFS currently lists.

    With the tags gone, all blobs are orphaned and become subject to
    eventual removal by the garbage collector.
    """
    client = DDFS()
    for tag_name in client.list():
        client.delete(tag_name)
Пример #2
0
def push_data(self, args):
  """
  Split a dataset file into chunks, push the chunks to DDFS and flag the
  dataset as ready.

  args is a mapping; two keys are read:
    'path'  -- path of the file handed to ``split``
    'ds_id' -- dataset id, used for the ``data:<ds_id>`` tag and to select
               the row of the ``datasets`` table that gets updated

  NOTE: the process working directory is changed to the scratch directory
  created under config.DISCO_FILES and is not restored afterwards.
  """
  path = args['path']
  ds_id = args['ds_id']

  filename = os.path.basename(path)
  # Second-resolution timestamp doubles as the scratch directory name.
  tmp_dir = str(int(time.time()))

  # Create temporary files
  os.chdir(config.DISCO_FILES)
  os.makedirs(tmp_dir)
  copy2(filename, "%s/%s" % (tmp_dir, filename))
  os.chdir(tmp_dir)

  # Build argv as a list rather than splitting a formatted string on spaces,
  # so a path containing spaces still reaches ``split`` as one argument.
  split_argv = ['split', '-n', str(config.DISCO_NODES), path]
  split_process = Popen(split_argv, stdout=PIPE)
  split_process.communicate()

  # Push data to cluster: every entry of the current directory whose name
  # starts with "xa" is uploaded under its own name.
  d = DDFS('disco://localhost')
  files = [("%s/%s/%s" % (config.DISCO_FILES, tmp_dir, chunk), chunk)
           for chunk in os.listdir(".") if chunk.startswith("xa")]
  d.push('data:%s' % ds_id, files)

  r.table('datasets').filter({
      'id': ds_id,
  }).update({
      'state': 'ready_for_crunching'
  }).run(db)
Пример #3
0
def load_oob(host, name, key):
    """
    Read the out-of-band blob stored under *key* for the job *name*.

    Returns the contents of the first matching item the pull yields, or
    None when nothing matches.
    """
    from disco.ddfs import DDFS
    client = DDFS(host)

    def wanted(blob):
        return blob == key

    # NB: this assumes that blobs are listed in LIFO order, so the first
    # match is the latest version.
    for handle in client.pull(client.job_oob(name), blobfilter=wanted):
        return handle.read()
    return None
Пример #4
0
def ddfs_save(blobs, name, master):
    """
    Push *blobs* to the DDFS at *master* under the tag derived from *name*.

    Each blob is uploaded with the target name
    ``discoblob:<name>:<basename of the blob>``.
    Returns the ``tag://`` URL of the tag.
    """
    from disco.ddfs import DDFS
    client = DDFS(master)
    named_blobs = []
    for path in blobs:
        target = 'discoblob:%s:%s' % (name, os.path.basename(path))
        named_blobs.append((path, target))
    tag = ddfs_name(name)
    client.push(tag, named_blobs, retries=600, delayed=True, update=True)
    return "tag://%s" % tag
Пример #5
0
def push_by_tag(file_paths, tag=None):
    """
    Push files to DDFS.

    With an explicit *tag*, all of *file_paths* are pushed under it in one
    call. Otherwise each file is pushed on its own, under a tag taken from
    its last "/"-separated path component cut at the first ".".
    """
    client = DDFS()
    if tag is not None:
        client.push(tag, file_paths)
        return
    for path in file_paths:
        last_component = path.split("/")[-1]
        client.push(last_component.split(".")[0], [path])
Пример #6
0
def inputexpand(input, partition=None, settings=DiscoSettings()):
    """
    Expand one job input into its concrete form.

    The checks run in this order:
      * partitioned input (unless *partition* is False): the parse_dir()
        results of every item, zipped together;
      * iterable input: a one-element list holding inputlist() of it;
      * DDFS tag: the blobs of every tag reported by findtags(), passed
        through chainify();
      * anything else: ``[input]``.
    """
    from disco.ddfs import DDFS, istag
    if ispartitioned(input) and partition is not False:
        per_item = (parse_dir(item, partition=partition)
                    for item in iterify(input))
        return zip(*per_item)
    if isiterable(input):
        return [inputlist(input, partition=partition, settings=settings)]
    if istag(input):
        found = DDFS(settings=settings).findtags(input)
        return chainify(blobs for _name, _tags, blobs in found)
    return [input]
Пример #7
0
 def __init__(self, docset_name):
     """
     Record the DDFS tag names derived from *docset_name* and create the
     DDFS client; the index starts unloaded and the docset clean.
     """
     # Companion tags share the docset name with a fixed suffix.
     index_tag = docset_name + ':index'
     links_tag = docset_name + ':links'
     client = DDFS()

     self.ddfs_tag = docset_name
     self.ddfs_index_tag = index_tag
     self.ddfs_link_file_tag = links_tag
     self.ddfs = client
     self.dirty = False
     self.__index = None
Пример #8
0
def push_by_tag(file_paths, tag=None):
    """
    Push files to DDFS, reporting an IOError instead of raising it.

    With an explicit *tag*, all of *file_paths* go under it in a single
    push. Without one, each file is pushed on its own under its base name
    with the extension removed. An IOError raised by a push is reported on
    stdout and swallowed.
    """
    client = DDFS()
    if tag is None:
        # Lazy on purpose: each tag is derived right before its own push.
        batches = ((os.path.splitext(ntpath.basename(path))[0], [path])
                   for path in file_paths)
    else:
        batches = [(tag, file_paths)]
    for batch_tag, batch_paths in batches:
        try:
            client.push(batch_tag, batch_paths)
        except IOError:
            print("Invalid file path specified.")
Пример #9
0
 def setUp(self):
     """
     Push one blob under the test tag, set the tag's read token to 'r' and
     point the job input at the tag before running the base setUp.
     """
     tag = 'disco:test:authjob'
     self.ddfs = DDFS(self.disco_master_url)
     # The result of push() is not needed, so it is not bound to a name.
     self.ddfs.push(tag, [(StringIO('blobdata'), 'blob')])
     self.ddfs.setattr(tag, 'ddfs:read-token', 'r')
     # NOTE(review): the credentials part of this URL looks redacted
     # ('*****:*****'); confirm it matches the read token set above.
     self.input = ['tag://*****:*****@/' + tag]
     super(AuthJobTestCase, self).setUp()
Пример #10
0
 def setUp(self):
     """
     Create the two tags used by the auth tests: one protected by a read
     token, one by a write token, each holding one blob and attribute a=v.
     """
     self.ddfs = DDFS(self.disco_master_url)
     read_tag, write_tag = 'disco:test:authrd', 'disco:test:authwr'
     for tag in (read_tag, write_tag):
         self.ddfs.push(tag, [(StringIO('datablob'), 'blobdata')])
     for tag in (read_tag, write_tag):
         self.ddfs.setattr(tag, 'a', 'v')
     # Tokens are set last, after the unauthenticated calls above.
     self.ddfs.setattr(read_tag, 'ddfs:read-token', 'rdr')
     self.ddfs.setattr(write_tag, 'ddfs:write-token', 'wtr')
Пример #11
0
 def setUp(self):
     """
     Create the fixtures for the read tests: a tag with two blobs, a tag
     with one empty blob, a plain tag and a metatag that links to the plain
     tag and to itself.
     """
     self.ddfs = DDFS(self.disco_master_url)
     uploads = (('disco:test:blobs', 'datablob', 'blobdata'),
                ('disco:test:blobs', 'datablob2', 'blobdata2'),
                ('disco:test:emptyblob', '', 'empty'))
     for tag, payload, blob_name in uploads:
         self.ddfs.push(tag, [(StringIO(payload), blob_name)])
     self.ddfs.tag('disco:test:tag', [['urls']])
     metatag_urls = [['tag://disco:test:tag'], ['tag://disco:test:metatag']]
     self.ddfs.tag('disco:test:metatag', metatag_urls)
Пример #12
0
 def put(self, key, value):
     r"""
     Store the out-of-band result *value* under the key *key*.

     The key must be unique in this job and at most 256 characters long.
     Only characters in the set ``[a-zA-Z_\-:0-9@]`` are allowed in it.

     Raises DiscoError when DDFS.safe_name() would alter the key.
     """
     sanitized = DDFS.safe_name(key)
     if sanitized != key:
         raise DiscoError("OOB key contains invalid characters (%s)" % key)
     util.save_oob(self.master, self.jobname, key, value)
Пример #13
0
class AuthJobTestCase(DiscoJobTestFixture, DiscoTestCase):
    """Job test whose only input is a DDFS tag that carries a read token."""

    # Replaced per instance in setUp with the tag URL.
    input = []

    @staticmethod
    def map(e, params):
        # Emit the stripped input entry as a key with an empty value.
        return [(e.strip(), '')]

    @property
    def answers(self):
        # Expected job output: the pushed blob's contents as the only key.
        return [('blobdata', '')]

    def setUp(self):
        """
        Push one blob under the test tag, set the tag's read token to 'r'
        and point the job input at the tag before running the base setUp.
        """
        tag = 'disco:test:authjob'
        self.ddfs = DDFS(self.disco_master_url)
        # The result of push() is not needed, so it is not bound to a name.
        self.ddfs.push(tag, [(StringIO('blobdata'), 'blob')])
        self.ddfs.setattr(tag, 'ddfs:read-token', 'r')
        # NOTE(review): the credentials part of this URL looks redacted
        # ('*****:*****'); confirm it matches the read token set above.
        self.input = ['tag://*****:*****@/' + tag]
        super(AuthJobTestCase, self).setUp()

    def tearDown(self):
        """Run the base tearDown, then delete the test tag."""
        super(AuthJobTestCase, self).tearDown()
        self.ddfs.delete('disco:test:authjob')
Пример #14
0
    def __tag_results(self, results):
        """
        Store the blobs of the result tag (``results[0]``) under the
        docset's link-file tag via put(), then delete the result tag.
        """
        from disco.ddfs import DDFS
        client = DDFS()
        temp_tag = results[0]
        link_tag = self.docset.ddfs_link_file_tag
        client.put(link_tag, list(client.blobs(temp_tag)))

        # The result tag was only temporary; drop it.
        client.delete(temp_tag)
Пример #15
0
    def put(self, key, value):
        r"""
        Store the out-of-band result *value* (bytes) under the key *key*.

        The key must be unique in this job.
        Its maximum length is 256 characters.
        Only characters in the set ``[a-zA-Z_\-:0-9@]`` are allowed in it.

        Raises DiscoError when DDFS.safe_name() would alter the key.
        """
        from disco.ddfs import DDFS
        from disco.util import save_oob
        from disco.error import DiscoError
        if DDFS.safe_name(key) == key:
            save_oob(self.master, self.jobname, key, value)
            return
        raise DiscoError("OOB key contains invalid characters ({0})".format(key))
Пример #16
0
 def __init__(self,
              host='',
              jobfile='',
              jobname='',
              master=None,
              disco_port=None,
              put_port=None,
              ddfs_data='',
              disco_data='',
              stage=None,
              group=None,
              grouping=None,
              taskid=-1):
     """
     Load the job pack from *jobfile* and record the task's parameters.

     *group* must be a two-item pair: it is formatted into ``self.group``
     as "<first>-<second>" and unpacked into ``self.group_label`` and
     ``self.group_host``. The remaining arguments are stored on the
     instance unchanged. ``self.uid`` combines the stage, the sanitized
     group, the task id, a hash of the current time and the process id.
     """
     from disco.job import JobPack
     from disco.ddfs import DDFS
     self.host = host
     self.jobfile = jobfile
     self.jobname = jobname
     # Close the job file once the pack is loaded instead of leaking the
     # handle. Assumes JobPack.load() reads everything it needs before
     # returning -- TODO confirm.
     with open(jobfile, 'rb') as jobfile_handle:
         self.jobpack = JobPack.load(jobfile_handle)
     self.jobobjs = dPickle.loads(self.jobpack.jobdata)
     self.master = master
     self.disco_port = disco_port
     self.put_port = put_port
     self.ddfs_data = ddfs_data
     self.disco_data = disco_data
     self.stage = stage
     self.group = '{0[0]}-{0[1]}'.format(group)
     self.group_label, self.group_host = group
     self.grouping = grouping
     self.taskid = taskid
     self.outputs = {}
     self.uid = '{0}:{1}-{2}-{3}-{4}'.format(self.stage,
                                             DDFS.safe_name(self.group),
                                             self.taskid,
                                             hexhash(str((time.time())).encode()),
                                             os.getpid())
Пример #17
0
 def tearDown(self):
     """Run the base tearDown, then delete the DDFS tags of both jobs."""
     super(SaveTestCase, self).tearDown()
     for job_attr in ('job_1', 'job_2'):
         client = DDFS(self.disco_master_url)
         client.delete(ddfs_name(getattr(self, job_attr).name))
Пример #18
0
 def blobnames(self, tag):
     """
     Return the blob names of *tag* in reverse listing order, each taken
     from the first element of the corresponding blobs() entry.
     """
     from disco.ddfs import DDFS
     names = [DDFS.blob_name(entry[0]) for entry in self.ddfs.blobs(tag)]
     names.reverse()
     return names
Пример #19
0
 def setUp(self):
     """Create the attribute test tag: one blob plus attributes a1 and a2."""
     tag = 'disco:test:attrs'
     self.ddfs = DDFS(self.disco_master_url)
     self.ddfs.push(tag, [(StringIO('datablob'), 'blobdata')])
     for attr_name, attr_value in (('a1', 'v1'), ('a2', 'v2')):
         self.ddfs.setattr(tag, attr_name, attr_value)
Пример #20
0
class DDFSWriteTestCase(DiscoTestCase):
    """Exercises the DDFS write operations: push, tag, put and delete."""

    def setUp(self):
        self.ddfs = DDFS(self.disco_master_url)

    def test_push(self):
        # push() is called with a bare tag name and with a tag:// URL.
        from cStringIO import StringIO
        self.ddfs.push('disco:test:blobs', [(StringIO('blobdata'), 'blobdata')])
        self.assertTrue(self.ddfs.exists('disco:test:blobs'))
        self.ddfs.push('tag://disco:test:blobs2', [(StringIO('blobdata'), 'blobdata')])
        self.assertTrue(self.ddfs.exists('disco:test:blobs2'))
        self.ddfs.delete('disco:test:blobs')
        self.assertFalse(self.ddfs.exists('disco:test:blobs'))
        self.ddfs.delete('disco:test:blobs2')
        self.assertFalse(self.ddfs.exists('disco:test:blobs2'))

    def test_tag(self):
        # tag() on an existing tag adds to its urls; compare order-insensitively.
        self.ddfs.tag('disco:test:tag', [['urls']])
        self.assertTrue(self.ddfs.exists('disco:test:tag'))
        self.ddfs.delete('disco:test:tag')
        self.assertFalse(self.ddfs.exists('disco:test:tag'))
        self.ddfs.tag('tag://disco:test:tag', [['urls']])
        self.assertTrue(self.ddfs.exists('tag://disco:test:tag'))
        self.ddfs.tag('disco:test:tag', [['more_urls']])
        self.assertEqual(sorted(self.ddfs.get('disco:test:tag')['urls']),
                         sorted([['urls'], ['more_urls']]))
        self.ddfs.delete('tag://disco:test:tag')
        self.assertFalse(self.ddfs.exists('tag://disco:test:tag'))

    def test_put(self):
        # put() replaces the tag's urls rather than adding to them.
        self.ddfs.put('disco:test:tag', [['urls']])
        self.assertTrue(self.ddfs.exists('disco:test:tag'))
        self.assertEqual(self.ddfs.get('disco:test:tag')['urls'], [['urls']])
        self.ddfs.put('disco:test:tag', [['tags']])
        self.assertEqual(self.ddfs.get('disco:test:tag')['urls'], [['tags']])
        self.ddfs.delete('tag://disco:test:tag')

    def test_delete(self):
        # Deleting a tag that was never created must not raise.
        self.ddfs.delete('disco:test:notag')
Пример #21
0
 def setUp(self):
     """Create the DDFS client for the test from the test master URL."""
     master_url = self.disco_master_url
     self.ddfs = DDFS(master_url)
Пример #22
0
class DDFSAuthTestCase(DiscoTestCase):
    """
    Token-based access control on DDFS tags.

    setUp creates 'disco:test:authrd' (read token 'rdr') and
    'disco:test:authwr' (write token 'wtr'), each with one blob and the
    attribute a=v.
    """

    def setUp(self):
        self.ddfs = DDFS(self.disco_master_url)
        self.ddfs.push('disco:test:authrd', [(StringIO('datablob'), 'blobdata')])
        self.ddfs.push('disco:test:authwr', [(StringIO('datablob'), 'blobdata')])
        self.ddfs.setattr('disco:test:authrd', 'a', 'v')
        self.ddfs.setattr('disco:test:authwr', 'a', 'v')
        self.ddfs.setattr('disco:test:authrd', 'ddfs:read-token', 'rdr')
        self.ddfs.setattr('disco:test:authwr', 'ddfs:write-token', 'wtr')

    def test_write_noread(self):
        # Only a write token is set: reads work without a token or with any.
        self.assertEqual(self.ddfs.getattr('disco:test:authwr', 'a'), 'v')
        self.assertEqual(self.ddfs.getattr('disco:test:authwr', 'a', token='rand'), 'v')

    def test_write_noread2(self):
        # Writes are rejected with 401 unless the write token is supplied.
        self.assertCommErrorCode(401, lambda: self.ddfs.setattr('disco:test:authwr', 'a2', 'v2'))
        rand_setter = lambda: self.ddfs.setattr('disco:test:authwr', 'a2', 'v2', token='rand')
        self.assertCommErrorCode(401, rand_setter)
        self.ddfs.setattr('disco:test:authwr', 'a2', 'v2', token='wtr')
        self.ddfs.delattr('disco:test:authwr', 'a2', token='wtr')

    def test_write_noread3(self):
        # Setting a read token needs the write token; once set, reads need it.
        setter = lambda: self.ddfs.setattr('disco:test:authwr', 'ddfs:read-token', 'r')
        self.assertCommErrorCode(401, setter)
        self.ddfs.setattr('disco:test:authwr', 'ddfs:read-token', 'r', token='wtr')
        self.assertCommErrorCode(401, lambda: self.ddfs.getattr('disco:test:authwr', 'a'))
        self.assertEqual(self.ddfs.getattr('disco:test:authwr', 'a', token='r'), 'v')
        self.ddfs.delattr('disco:test:authwr', 'ddfs:read-token', token='wtr')

    def test_read_nowrite(self):
        # Reads are rejected with 401 unless the read token is supplied.
        self.assertCommErrorCode(401, lambda: self.ddfs.getattr('disco:test:authrd', 'a'))
        rand_getter = lambda: self.ddfs.getattr('disco:test:authrd', 'a', token='rand')
        self.assertCommErrorCode(401, rand_getter)
        self.assertEqual(self.ddfs.getattr('disco:test:authrd', 'a', token='rdr'), 'v')

    def test_read_nowrite2(self):
        # Only a read token is set: writes work without a token or with any.
        self.ddfs.setattr('disco:test:authrd', 'a2', 'v2')
        self.assertEqual(self.ddfs.getattr('disco:test:authrd', 'a2', token='rdr'), 'v2')
        self.ddfs.delattr('disco:test:authrd', 'a2', token='rand')

    def test_read_nowrite3(self):
        # The read token itself can be changed without a token; it is put
        # back to 'rdr' so tearDown and other tests see the original value.
        self.ddfs.setattr('disco:test:authrd', 'ddfs:read-token', 'r')
        self.ddfs.setattr('disco:test:authrd', 'ddfs:read-token', 'rdr')

    def test_atomic_token(self):
        # A token passed to push()/put() when creating a tag is expected to
        # protect it right away and to be readable as its write token.
        self.ddfs.push('disco:test:atomic1',
                        [(StringIO('abc'), 'atom')],
                        update=True,
                        delayed=True,
                        token='secret1')
        getter = lambda: self.ddfs.getattr('disco:test:atomic1', 'foobar')
        self.assertCommErrorCode(401, getter)
        self.assertEqual(self.ddfs.getattr('disco:test:atomic1',
                                           'ddfs:write-token',
                                           token='secret1'), 'secret1')
        self.ddfs.put('disco:test:atomic2', [], token='secret2')
        getter = lambda: self.ddfs.getattr('disco:test:atomic2', 'foobar')
        self.assertCommErrorCode(401, getter)
        self.assertEqual(self.ddfs.getattr('disco:test:atomic2',
                                           'ddfs:write-token',
                                           token='secret2'), 'secret2')
        # Without a token at creation there is no write token.
        self.ddfs.put('disco:test:notoken', [])
        self.assertEqual(self.ddfs.getattr('disco:test:notoken',
                                           'ddfs:write-token'), None)

    def tearDown(self):
        self.ddfs.delete('disco:test:authrd')
        self.ddfs.delete('disco:test:authwr', token='wtr')
        self.ddfs.delete('disco:test:atomic1', token='secret1')
        self.ddfs.delete('disco:test:atomic2', token='secret2')
        self.ddfs.delete('disco:test:notoken')
Пример #23
0
 def blobnames(self, tag):
     """
     List the blob names of *tag*, last-listed first; each name comes from
     the first element of the corresponding blobs() entry.
     """
     from disco.ddfs import DDFS
     in_listing_order = []
     for entry in self.ddfs.blobs(tag):
         in_listing_order.append(DDFS.blob_name(entry[0]))
     return in_listing_order[::-1]
Пример #24
0
class DdfsGcTests(TestCase):
    """
    Pushes or chunks FILE under COUNT tags, triggers a garbage collection
    run and checks that every tag still lists exactly one non-empty blob
    entry afterwards.
    """

    def setUp(self):
        self.d = DDFS()
        wait_for_gc_to_finish(self.d)
        with open(FILE, 'w') as f:
            # write() instead of the Python-2-only ``print >>f`` form;
            # the bytes written are the same.
            f.write("hello world!\n")

    def _test_push(self, prefix, func):
        # func(tag, [FILE]) is called for tags prefix+"0" .. prefix+str(COUNT-1).
        for i in range(COUNT):
            func(prefix + str(i), [FILE])
        # Ask the master to start a GC run, then wait until it is over.
        self.d._download(self.d.master + "/ddfs/ctrl/gc_start")

        wait_for_gc_to_finish(self.d)
        for i in range(COUNT):
            blobs = list(self.d.blobs(prefix + str(i)))
            self.assertEqual(len(blobs), 1)
            self.assertGreater(len(blobs[0]), 0)

    def test_push_deterministic(self):
        self._test_push(PREFIX + str(uuid1()), self.d.push)

    def test_push_same_tag(self):
        self._test_push(PREFIX, self.d.push)

    def test_chunk_deterministic(self):
        self._test_push(PREFIX + str(uuid1()), self.d.chunk)

    def test_chunk_same_tag(self):
        self._test_push(PREFIX, self.d.chunk)

    def test_chunk_delayed(self):
        self._test_push(PREFIX, partial(self.d.chunk, delayed=True))

    def test_push_delayed(self):
        self._test_push(PREFIX, partial(self.d.push, delayed=True))

    def test_chunk_none_replicas(self):
        self._test_push(PREFIX, partial(self.d.chunk, replicas=None))

    def _test_func_tag(self, prefix, func):
        # Same as _test_push, but each pushed blob set is also attached to a
        # second tag named <tag>"tag", which is verified after the GC run.
        def chunk_tag(name, input):
            _, blob_set = func(name, input)
            self.d.tag(name + "tag", blob_set)
        # Push under the same prefix that is verified below; every caller
        # passes PREFIX, so this matches the previous hard-coded value.
        self._test_push(prefix, chunk_tag)

        for i in range(COUNT):
            blobs = list(self.d.blobs(prefix + str(i) + "tag"))
            self.assertEqual(len(blobs), 1)
            self.assertGreater(len(blobs[0]), 0)

    def test_chunk_tag(self):
        self._test_func_tag(PREFIX, self.d.chunk)

    def test_chunk_tag_delayed(self):
        self._test_func_tag(PREFIX, partial(self.d.chunk, delayed=True))

    def test_push_tag(self):
        self._test_func_tag(PREFIX, self.d.push)

    def test_push_tag_delayed(self):
        self._test_func_tag(PREFIX, partial(self.d.push, delayed=True))

    def tearDown(self):
        # Remove every tag whose name starts with PREFIX.
        tags = self.d.list(PREFIX)
        for tag in tags:
            self.d.delete(tag)
Пример #25
0
class DDFSWriteTestCase(DiscoTestCase):
    """Exercises the DDFS write operations: chunk, push, tag, put, delete."""

    def setUp(self):
        self.ddfs = DDFS(self.disco_master_url)

    def test_chunk(self):
        from disco.core import RecordIter
        url = 'http://discoproject.org/media/text/chekhov.txt'
        self.ddfs.chunk('disco:test:chunk', [url], chunk_size=100*1024)
        self.assertTrue(0 < len(list(self.ddfs.blobs('disco:test:chunk'))) <= 4)
        # This used to be assert_(a, b), which treats b as the failure
        # message and only checks that a is non-empty. The records read back
        # from the tag are now actually compared with the source's records.
        self.assertEqual(list(RecordIter(['tag://disco:test:chunk'])),
                         list(RecordIter([url], reader=None)))
        self.ddfs.delete('disco:test:chunk')

    def test_push(self):
        # push() is called with a bare tag name and with a tag:// URL.
        self.ddfs.push('disco:test:blobs', [(StringIO('blobdata'), 'blobdata')])
        self.assertTrue(self.ddfs.exists('disco:test:blobs'))
        self.ddfs.push('tag://disco:test:blobs2', [(StringIO('blobdata'), 'blobdata')])
        self.assertTrue(self.ddfs.exists('disco:test:blobs2'))
        self.ddfs.delete('disco:test:blobs')
        self.assertFalse(self.ddfs.exists('disco:test:blobs'))
        self.ddfs.delete('disco:test:blobs2')
        self.assertFalse(self.ddfs.exists('disco:test:blobs2'))

    def test_tag(self):
        # tag() on an existing tag adds to its urls; compare order-insensitively.
        self.ddfs.tag('disco:test:tag', [['urls']])
        self.assertTrue(self.ddfs.exists('disco:test:tag'))
        self.ddfs.delete('disco:test:tag')
        self.assertFalse(self.ddfs.exists('disco:test:tag'))
        self.ddfs.tag('tag://disco:test:tag', [['urls']])
        self.assertTrue(self.ddfs.exists('tag://disco:test:tag'))
        self.ddfs.tag('disco:test:tag', [['more_urls']])
        self.assertEqual(sorted(self.ddfs.get('disco:test:tag')['urls']),
                         sorted([['urls'], ['more_urls']]))
        self.ddfs.delete('tag://disco:test:tag')
        self.assertFalse(self.ddfs.exists('tag://disco:test:tag'))

    def test_put(self):
        # put() replaces the tag's urls rather than adding to them.
        self.ddfs.put('disco:test:tag', [['urls']])
        self.assertTrue(self.ddfs.exists('disco:test:tag'))
        self.assertEqual(self.ddfs.get('disco:test:tag')['urls'], [['urls']])
        self.ddfs.put('disco:test:tag', [['tags']])
        self.assertEqual(self.ddfs.get('disco:test:tag')['urls'], [['tags']])
        self.ddfs.delete('tag://disco:test:tag')

    def test_delete(self):
        # Deleting a tag that was never created must not raise.
        self.ddfs.delete('disco:test:notag')

    def tearDown(self):
        # Clean up every tag a failed test may have left behind.
        self.ddfs.delete('disco:test:notag')
        self.ddfs.delete('disco:test:tag')
        self.ddfs.delete('disco:test:blobs')
        self.ddfs.delete('disco:test:blobs2')
Пример #26
0
class DDFSUpdateTestCase(DiscoTestCase):
    """Checks which blob names a tag lists after pushes with update=True."""

    # One StringIO object shared by every push in this class.
    data = StringIO('blobdata')

    def setUp(self):
        self.ddfs = DDFS(self.disco_master_url)

    def blobnames(self, tag):
        # Blob names of the tag in reverse listing order.
        return list(reversed(list(DDFS.blob_name(repl[0])
                                  for repl in self.ddfs.blobs(tag))))

    def test_update(self):
        # Repeated update pushes of the same name leave a single entry.
        for _ in range(5):
            self.ddfs.push('disco:test:blobs',
                           [(self.data, 'dup')] * 2,
                           update=True)
        self.assertEqual(len(self.blobnames('disco:test:blobs')), 1)
        for _ in range(5):
            self.ddfs.push('disco:test:blobs',
                           [(self.data, 'dup2')],
                           update=True,
                           delayed=True)
        self.assertEqual(len(self.blobnames('disco:test:blobs')), 2)
        self.ddfs.delete('disco:test:blobs')

    def test_random(self):
        # With update=True the tag is expected to keep each distinct name
        # once, in order of first appearance.
        import random
        keys = [str(random.randint(1, 100)) for _ in range(1000)]
        ukeys = []
        for key in keys:
            self.ddfs.push('disco:test:blobs', [(self.data, key)], update=True)
            if key not in ukeys:
                ukeys.append(key)
        self.assertEqual(ukeys, self.blobnames('disco:test:blobs'))
        self.ddfs.delete('disco:test:blobs')

    def test_mixed(self):
        # Plain pushes keep duplicates; update pushes only add names that
        # are not present yet (10..14 here).
        keys = []
        for key in map(str, range(10)):
            self.ddfs.push('disco:test:blobs', [(self.data, key)] * 2)
            keys += [key] * 2
        for key in map(str, range(15)):
            self.ddfs.push('disco:test:blobs',
                           [(self.data, key)] * 2,
                           update=True)
            if int(key) > 9:
                keys.append(key)
        for key in map(str, range(10)):
            self.ddfs.push('disco:test:blobs',
                           [(self.data, key)] * 2,
                           delayed=True)
            keys += [key] * 2
        self.assertEqual(keys, self.blobnames('disco:test:blobs'))
        self.ddfs.delete('disco:test:blobs')

    def tearDown(self):
        self.ddfs.delete('disco:test:blobs')
Пример #27
0
def list_by_tag(tag):
    """
    Return what a fresh DDFS client's list() reports for *tag*.

    NOTE(review): list() may return tag names matching a prefix rather
    than blobs -- confirm against the DDFS API.
    """
    return DDFS().list(tag)
Пример #28
0
        cur_date = datetime.strptime(cur_date, "%Y-%m-%d") + timedelta(days=1)
        cur_date = cur_date.strftime("%Y-%m-%d")
    return days


if __name__ == "__main__":
    # Usage: <script> START_DAY END_DAY KEYWORD_FILE
    # Runs the TweetFilter job over every "enriched:<day>" tag in the day
    # range and writes the result values to filtered_tweet_company.txt.
    from twitter_filter import TweetFilter
    from disco.core import result_iterator
    from disco.ddfs import DDFS
    import sys

    start_day = sys.argv[1]
    end_day = sys.argv[2]
    keyword_file = sys.argv[3]

    ddfs = DDFS()
    days = get_days(start_day, end_day)
    tags = []
    for day in days:
        # extend() grows the list in place instead of rebuilding it per day.
        tags.extend(ddfs.list("enriched:%s" % day))
    job_name = "Tweet_filter"
    # Close the keyword file as soon as it has been parsed.
    with open(keyword_file) as keyword_handle:
        params = json.load(keyword_handle)
    inputs = ["tag://%s" % tag for tag in tags]
    # Single-argument call form prints the same text on Python 2 and 3.
    print("Days[%d], Files[%d]" % (len(days), len(inputs)))
    job = TweetFilter().run(input=inputs, partitions=len(days), params=params, name=job_name)
    result = job.wait(show=False)
    out_file = "filtered_tweet_company.txt"
    with open(out_file, "w") as ow:
        for k, v in result_iterator(result):
            ow.write(v + "\n")
Пример #29
0
 def runTest(self):
     # The job's result tag must list one blob entry per input, and the
     # sorted job results must equal the expected answers.
     results = sorted(self.results)
     ddfs = DDFS(self.disco_master_url)
     tag = self.disco.results(self.job.name)[1][0]
     self.assertEqual(len(list(ddfs.blobs(tag))), len(self.inputs))
     self.assertEqual(self.answers, results)
Пример #30
0
def save_oob(host, name, key, value, ddfs_token=None):
    """
    Store *value* as an out-of-band blob named *key* for the job *name*.

    The value is wrapped in a StringIO and pushed, with delayed=True, to
    the job's OOB tag on the DDFS at *host*. *ddfs_token* is accepted but
    not used here.
    """
    from disco.ddfs import DDFS
    client = DDFS(host)
    oob_tag = DDFS.job_oob(name)
    payload = StringIO(value)
    client.push(oob_tag, [(payload, key)], delayed=True)
Пример #31
0
 def setUp(self):
     """
     Create the DDFS client, wait for any running garbage collection to
     finish and (re)write the test file FILE.
     """
     self.d = DDFS()
     wait_for_gc_to_finish(self.d)
     with open(FILE, 'w') as f:
         # write() instead of the Python-2-only ``print >>f`` form; the
         # bytes written are the same.
         f.write("hello world!\n")
Пример #32
0
 def ddfs(self):
     """Return a new DDFS client built from this object's settings."""
     from disco.ddfs import DDFS
     client = DDFS(settings=self.settings)
     return client
Пример #33
0
Файл: core.py Проект: hmas/disco
 def ddfs(self):
     """Return a new DDFS client bound to this object's master."""
     from disco.ddfs import DDFS
     client = DDFS(self.master)
     return client
Пример #34
0
def save_oob(host, name, key, value, ddfs_token=None):
    """
    Push *value* to the OOB tag of job *name* as a blob named *key*.

    The push goes to the DDFS at *host* with delayed=True. The
    *ddfs_token* argument is accepted but not used by this function.
    """
    from disco.ddfs import DDFS
    client = DDFS(host)
    target_tag = DDFS.job_oob(name)
    blob = (StringIO(value), key)
    client.push(target_tag, [blob], delayed=True)
Пример #35
0
class DDFSReadTestCase(DiscoTestCase):
    """
    Exercises the DDFS read operations against fixtures created in setUp:
    a tag with two blobs, a tag with one empty blob, a plain tag and a
    metatag that links to the plain tag and to itself.
    """

    def setUp(self):
        from cStringIO import StringIO
        self.ddfs = DDFS(self.disco_master_url)
        self.ddfs.push('disco:test:blobs', [(StringIO('datablob'), 'blobdata')])
        self.ddfs.push('disco:test:blobs', [(StringIO('datablob2'), 'blobdata2')])
        self.ddfs.push('disco:test:emptyblob', [(StringIO(''), 'empty')])
        self.ddfs.tag('disco:test:tag', [['urls']])
        self.ddfs.tag('disco:test:metatag',
                      [['tag://disco:test:tag'], ['tag://disco:test:metatag']])

    def test_blobs(self):
        # A missing tag is a 404 only when ignore_missing=False; by default
        # it yields nothing.
        from os.path import basename
        blobs = list(self.ddfs.blobs('disco:test:blobs'))
        self.assertTrue(basename(blobs[0][0]).startswith('blobdata'))
        self.assertCommErrorCode(404,
                                 lambda: list(self.ddfs.blobs('disco:test:notag',
                                                         ignore_missing=False)))
        self.assertEqual(list(self.ddfs.blobs('disco:test:notag')), [])

    def test_pull(self):
        # Blobs come back most recently pushed first.
        self.assertEqual([(self.ddfs.blob_name(url), fd.read())
                          for fd, sze, url in self.ddfs.pull('disco:test:blobs')],
                         [('blobdata2', 'datablob2'), ('blobdata', 'datablob')])
        self.assertEqual([(self.ddfs.blob_name(url), fd.read())
                          for fd, sze, url in self.ddfs.pull('disco:test:blobs',
                                                             blobfilter=lambda b: '2' in b)],
                         [('blobdata2', 'datablob2')])
        self.assertEqual([(sze, fd.read()) for fd, sze, url in
                          self.ddfs.pull('disco:test:emptyblob')], [(0, '')])
        # pull() is still called up front, as before; next() replaces the
        # Python-2-only ``.next`` bound method.
        missing = self.ddfs.pull('disco:test:notag')
        self.assertCommErrorCode(404, lambda: next(missing))

    def test_exists(self):
        self.assertEqual(self.ddfs.exists(''), False)
        self.assertEqual(self.ddfs.exists('!!'), False)
        self.assertEqual(self.ddfs.exists('disco:test:tag'), True)
        self.assertEqual(self.ddfs.exists('disco:test:notag'), False)
        self.assertEqual(self.ddfs.exists('tag://disco:test:tag'), True)
        self.assertEqual(self.ddfs.exists('tag://disco:test:notag'), False)

    def test_findtags(self):
        # The metatag links to itself; exhausting findtags() must terminate.
        list(self.ddfs.findtags(['disco:test:metatag']))

    def test_get(self):
        # An empty tag name is a 403, an unknown tag a 404.
        self.assertCommErrorCode(403, lambda: self.ddfs.get(''))
        self.assertCommErrorCode(404, lambda: self.ddfs.get('disco:test:notag'))
        self.assertEqual(self.ddfs.get('disco:test:tag')['urls'], [['urls']])
        self.assertEqual(self.ddfs.get(['disco:test:tag'])['urls'], [['urls']])

    def test_list(self):
        # list() takes an optional tag-name prefix.
        self.assertTrue('disco:test:tag' in self.ddfs.list())
        self.assertTrue('disco:test:tag' in self.ddfs.list('disco:test'))
        self.assertEqual(self.ddfs.list('disco:test:notag'), [])

    def test_walk(self):
        list(self.ddfs.walk('disco:test:tag'))

    def tearDown(self):
        self.ddfs.delete('disco:test:blobs')
        self.ddfs.delete('disco:test:emptyblob')
        self.ddfs.delete('disco:test:tag')
        self.ddfs.delete('disco:test:metatag')
Пример #36
0
Файл: test.py Проект: yuj/disco
 def ddfs(self):
     """Return a new DDFS client built from this object's settings."""
     options = {'settings': self.settings}
     return DDFS(**options)
Пример #37
0
class Docset(object):
    """
    A `Docset` represents a set of documents, contained in dump files stored on
    DDFS. Class instantiation alone doesn't do anything to DDFS; the DDFS tag
    for this docset won't exist until a dump is added.
    """

    def __init__(self, docset_name):
        self.ddfs_tag = docset_name
        self.ddfs_index_tag = docset_name + ':index'
        self.ddfs_link_file_tag = docset_name + ':links'
        self.ddfs = DDFS()
        # Loaded lazily by the `index` property.
        self.__index = None
        # True while the in-memory index has changes that save() has not pushed.
        self.dirty = False

    def exists(self):
        """Returns True if this Docset exists in DDFS."""
        return self.ddfs.exists(self.ddfs_tag)

    def delete(self):
        """
        Deletes this tag from DDFS. DDFS garbage collection will soon take care
        of dumps in this docset with no other tags. If other docsets link to
        this docset's dumps, then those dumps will remain.
        """
        self.ddfs.delete(self.ddfs_index_tag)
        self.ddfs.delete(self.ddfs_tag)

    # Width the index version number is zero-padded to in blob names, so that
    # version names sort lexicographically.
    INDEX_VERSION_PAD = 4

    @property
    def index(self):
        """
        The docset index: a dict mapping document URI to
        ``(dump_name, start_pos, size)``. Loaded from DDFS on first access.
        """
        # Lazily load index data from DDFS.
        if self.__index is None:
            blobs = [uri for (uri,) in self.ddfs.blobs(self.ddfs_index_tag)]
            if not blobs:
                self.__index = {}
                self.__index_version = 0
            else:
                # Find blob with highest version number.
                ver, discouri = max((self.__blob_uri_to_dump_name(blob), blob)
                                    for blob in blobs)
                uri = urlresolve(discouri)
                # Close the HTTP response once it has been read.
                response = urllib2.urlopen(uri)
                try:
                    data = response.read()
                finally:
                    response.close()
                try:
                    # NOTE(security): this unpickles data fetched over the
                    # network; only safe if the DDFS contents are trusted.
                    self.__index = pickle.loads(data)
                    self.__index_version = int(ver)
                except EOFError:
                    raise EOFError("EOF reading docset index at %s in tag %s" % \
                                       (uri, self.ddfs_index_tag))
        return self.__index

    def save(self):
        """
        Pickles the index and pushes it to the index tag as a new blob named
        after the next version number, then clears the dirty flag.
        """
        import tempfile
        self.index # load if hasn't been loaded yet
        self.__index_version += 1
        ver = "%0*d" % (self.INDEX_VERSION_PAD, self.__index_version)
        # A named temporary file is removed on close and gets an
        # unpredictable name, unlike a fixed path under /tmp that would be
        # left behind after every save.
        with tempfile.NamedTemporaryFile(mode='w+b') as f:
            pickle.dump(self.__index, f)
            f.flush()
            f.seek(0)
            self.ddfs.push(self.ddfs_index_tag, [(f, ver)])
        self.dirty = False

    def add_dump(self, dumpname, dump):
        """
        Adds a dump to this docset and indexes its documents by position,
        uploading the dump to DDFS with the tag for this docset.
        """
        # index positions
        startpos = 0
        endpos = None
        with open(dump, 'rb') as f:
            dociter = WARCParser(f)
            for doc in dociter:
                # The parser position after a document marks where it ends.
                endpos = dociter.tell()
                self.index[doc.uri] = (dumpname, startpos, endpos - startpos)
                startpos = endpos
        self.ddfs.push(self.ddfs_tag, [(dump, dumpname)])
        self.dirty = True

    @property
    def doc_count(self):
        """
        Returns the total number of documents contained in all dumps in this
        docset.
        """
        return len(self.index)

    def doc_uris(self):
        """Returns all URIs of documents contained in all dumps in this
        docset."""
        return self.index.keys()

    def dump_uris(self):
        """
        Returns disco:// URIs for each dump in the docset. Use
        disco.util.urlresolve to convert the disco:// URIs to http:// URIs.
        """
        return (uri for (uri,) in self.ddfs.blobs(self.ddfs_tag))

    def __blob_uri_to_dump_name(self, bloburi):
        """
        Takes a blob URI like
           disco://host/ddfs/vol0/blob/b4/dumpname$4fd-ea750-6d4e1
        and returns "dumpname".
        """
        return re.search(r'/([\w0-9_\-@:]+)\$', bloburi).group(1)

    def __dump_name_to_blob_uri(self, dumpname):
        """
        Takes a dump name like "dumpname" and returns the blob URI like
           disco://host/ddfs/vol0/blob/b4/dumpname$000-11111-fffff

        Raises KeyError if no dump in the docset has that name.
        """
        for uri in self.dump_uris():
            if dumpname == self.__blob_uri_to_dump_name(uri):
                return uri
        raise KeyError(dumpname)

    def dump_names(self):
        """Returns the names of dumps in the docset."""
        return [self.__blob_uri_to_dump_name(uri) for uri in self.dump_uris()]

    def get_pos(self, uri):
        """Returns a tuple `(dump_name, start_pos, size)` locating the
        document `uri` in the docset. Raises `DocumentNotFound` if the URI
        is not indexed."""
        if uri in self.index:
            return self.index[uri]
        else:
            raise DocumentNotFound()

    def get(self, uri):
        """Returns the `Document` with the specified `uri`."""
        name, startpos, size = self.get_pos(uri)
        try:
            dump_uri = urlresolve(self.__dump_name_to_blob_uri(name))
        except KeyError:
            raise DocumentNotFound("couldn't find doc with dump name '%s'" % name)

        # Fetch only the document's byte range; HTTP ranges are inclusive.
        req = urllib2.Request(dump_uri)
        req.add_header("Range", "bytes=%d-%d" % (startpos, startpos + size - 1))
        res = urllib2.urlopen(req)
        return WARCParser(res).next()
Пример #38
0
from discodex import settings
from discodex.mapreduce import (Indexer, DiscoDBIterator)
from discodex.objects import (DataSet, IChunks, Indices, Index, Results, Dict)

from disco.core import Disco
from disco.ddfs import DDFS
from disco.error import DiscoError
from disco.util import ddfs_name, flatten, parse_dir

# Discodex configuration, resolved once when this module is imported.
discodex_settings = settings.DiscodexSettings()
disco_master_url = discodex_settings['DISCODEX_DISCO_MASTER']
disco_prefix = discodex_settings['DISCODEX_DISCO_PREFIX']
index_prefix = discodex_settings['DISCODEX_INDEX_PREFIX']
purge_file = discodex_settings['DISCODEX_PURGE_FILE']
# Module-wide clients, both created from the configured master URL.
disco_master = Disco(disco_master_url)
ddfs = DDFS(disco_master_url)

# Status strings; presumably job/index states reported to clients -- verify
# against the code that uses them.
NOT_FOUND, OK, ACTIVE, DEAD = 'unknown job', 'ready', 'active', 'dead'


class IndexCollection(Collection):
    allowed_methods = ('GET', 'POST')

    def delegate(self, request, *args, **kwargs):
        name = str(kwargs.pop('name'))
        return IndexResource(name)(request, *args, **kwargs)

    @property
    def names(self):
        return ddfs.list(index_prefix)