예제 #1
0
class DDFSReadTestCase(DiscoTestCase):
    """Read-side DDFS tests that run against fixtures created in `setUp`."""

    def setUp(self):
        # Fixture: two named blobs under one tag, one empty blob, a plain tag
        # and a tag whose urls are tag:// links (one of them to itself).
        client = DDFS(self.disco_master_url)
        self.ddfs = client
        for payload, blob_name in (('datablob', 'blobdata'),
                                   ('datablob2', 'blobdata2')):
            client.push('disco:test:blobs', [(StringIO(payload), blob_name)])
        client.push('disco:test:emptyblob', [(StringIO(''), 'empty')])
        client.tag('disco:test:tag', [['urls']])
        metatag_urls = [['tag://disco:test:tag'], ['tag://disco:test:metatag']]
        client.tag('disco:test:metatag', metatag_urls)

    def test_blobs(self):
        """blobs() lists pushed blobs; a missing tag is a 404 only when
        ignore_missing=False, otherwise it yields nothing."""
        from os.path import basename
        entries = list(self.ddfs.blobs('disco:test:blobs'))
        first_url = entries[0][0]
        self.assert_(basename(first_url).startswith('blobdata'))

        def strict_lookup():
            return list(self.ddfs.blobs('disco:test:notag',
                                        ignore_missing=False))
        self.assertCommErrorCode(404, strict_lookup)

        lenient_result = list(self.ddfs.blobs('disco:test:notag'))
        self.assertEquals(lenient_result, [])

    def test_pull(self):
        """pull() yields (fd, size, url) triples, honours blobfilter, and
        fails with a 404 for a missing tag once iteration starts."""
        def names_and_data(**pull_kwargs):
            pulled = self.ddfs.pull('disco:test:blobs', **pull_kwargs)
            return [(self.ddfs.blob_name(url), fd.read())
                    for fd, sze, url in pulled]

        self.assertEquals(names_and_data(),
                          [('blobdata2', 'datablob2'), ('blobdata', 'datablob')])
        self.assertEquals(names_and_data(blobfilter=lambda b: '2' in b),
                          [('blobdata2', 'datablob2')])

        sizes_and_data = [(sze, fd.read())
                          for fd, sze, url in self.ddfs.pull('disco:test:emptyblob')]
        self.assertEquals(sizes_and_data, [(0, '')])

        # Only the bound `next` is handed over; the assertion helper calls it.
        missing_pull = self.ddfs.pull('disco:test:notag')
        self.assertCommErrorCode(404, missing_pull.next)

    def test_exists(self):
        """exists() accepts both bare tag names and tag:// urls."""
        expectations = (
            ('disco:test:tag', True),
            ('disco:test:notag', False),
            ('tag://disco:test:tag', True),
            ('tag://disco:test:notag', False),
        )
        for tag, expected in expectations:
            self.assertEquals(self.ddfs.exists(tag), expected)

    def test_findtags(self):
        """findtags() can be fully consumed for the self-linking metatag."""
        found = self.ddfs.findtags(['disco:test:metatag'])
        list(found)

    def test_get(self):
        """get() raises a 404 for a missing tag and accepts a name or a list."""
        self.assertCommErrorCode(404, lambda: self.ddfs.get('disco:test:notag'))
        for tag_spec in ('disco:test:tag', ['disco:test:tag']):
            self.assertEquals(self.ddfs.get(tag_spec)['urls'], [['urls']])

    def test_list(self):
        """list() contains the fixture tag, with and without an argument."""
        everything = self.ddfs.list()
        self.assert_('disco:test:tag' in everything)
        narrowed = self.ddfs.list('disco:test')
        self.assert_('disco:test:tag' in narrowed)
        self.assertEquals(self.ddfs.list('disco:test:notag'), [])

    def test_walk(self):
        """walk() can be fully consumed for an existing tag."""
        visited = self.ddfs.walk('disco:test:tag')
        list(visited)

    def tearDown(self):
        # Remove every tag created by setUp, in the order they were created.
        for tag in ('disco:test:blobs',
                    'disco:test:emptyblob',
                    'disco:test:tag',
                    'disco:test:metatag'):
            self.ddfs.delete(tag)
예제 #2
0
class DDFSWriteTestCase(DiscoTestCase):
    """Write-side DDFS tests: chunk, push, tag, put and delete.

    Every tag a test may create is removed again in `tearDown`, so a failing
    assertion does not leave tags behind.
    """

    def setUp(self):
        self.ddfs = DDFS(self.disco_master_url)

    def test_chunk(self):
        """Chunking a url creates 1-4 blobs whose records match the source."""
        from disco.core import RecordIter
        url = 'http://discoproject.org/media/text/chekhov.txt'
        self.ddfs.chunk('disco:test:chunk', [url], chunk_size=100*1024)
        self.assert_(0 < len(list(self.ddfs.blobs('disco:test:chunk'))) <= 4)
        # This used to be `self.assert_(a, b)`, which only checks that `a` is
        # truthy and uses `b` as the failure message; the two record lists
        # were never actually compared.
        self.assertEquals(list(RecordIter(['tag://disco:test:chunk'])),
                          list(RecordIter([url], reader=None)))
        # 'disco:test:chunk' is deleted in tearDown so that it is also
        # cleaned up when one of the assertions above fails.

    def test_push(self):
        """push() accepts bare and tag:// names; delete() removes the tags."""
        self.ddfs.push('disco:test:blobs', [(StringIO('blobdata'), 'blobdata')])
        self.assert_(self.ddfs.exists('disco:test:blobs'))
        self.ddfs.push('tag://disco:test:blobs2', [(StringIO('blobdata'), 'blobdata')])
        self.assert_(self.ddfs.exists('disco:test:blobs2'))
        self.ddfs.delete('disco:test:blobs')
        self.assert_(not self.ddfs.exists('disco:test:blobs'))
        self.ddfs.delete('disco:test:blobs2')
        self.assert_(not self.ddfs.exists('disco:test:blobs2'))

    def test_tag(self):
        """tag() creates a tag and appends urls to an existing one."""
        self.ddfs.tag('disco:test:tag', [['urls']])
        self.assert_(self.ddfs.exists('disco:test:tag'))
        self.ddfs.delete('disco:test:tag')
        self.assert_(not self.ddfs.exists('disco:test:tag'))
        self.ddfs.tag('tag://disco:test:tag', [['urls']])
        self.assert_(self.ddfs.exists('tag://disco:test:tag'))
        self.ddfs.tag('disco:test:tag', [['more_urls']])
        # Order of the stored urls is not asserted, hence the sort.
        self.assertEquals(sorted(self.ddfs.get('disco:test:tag')['urls']),
                          sorted([['urls'], ['more_urls']]))
        self.ddfs.delete('tag://disco:test:tag')
        self.assert_(not self.ddfs.exists('tag://disco:test:tag'))

    def test_put(self):
        """put() replaces the urls of a tag instead of appending to them."""
        self.ddfs.put('disco:test:tag', [['urls']])
        self.assert_(self.ddfs.exists('disco:test:tag'))
        self.assertEquals(self.ddfs.get('disco:test:tag')['urls'], [['urls']])
        self.ddfs.put('disco:test:tag', [['tags']])
        self.assertEquals(self.ddfs.get('disco:test:tag')['urls'], [['tags']])
        self.ddfs.delete('tag://disco:test:tag')

    def test_delete(self):
        """Deleting a tag that does not exist must not raise."""
        self.ddfs.delete('disco:test:notag')

    def tearDown(self):
        # Deleting a missing tag is harmless (see test_delete), so every tag
        # any test in this class may have created is removed unconditionally.
        self.ddfs.delete('disco:test:notag')
        self.ddfs.delete('disco:test:tag')
        self.ddfs.delete('disco:test:blobs')
        self.ddfs.delete('disco:test:blobs2')
        self.ddfs.delete('disco:test:chunk')
예제 #3
0
class DDFSWriteTestCase(DiscoTestCase):
    """Write-side DDFS tests: push, tag, put and delete."""

    def setUp(self):
        self.ddfs = DDFS(self.disco_master_url)

    def test_push(self):
        """push() accepts bare and tag:// names; delete() removes the tags."""
        from cStringIO import StringIO
        ddfs = self.ddfs
        # (name handed to push, bare tag name used for the existence check)
        targets = (('disco:test:blobs', 'disco:test:blobs'),
                   ('tag://disco:test:blobs2', 'disco:test:blobs2'))
        for push_name, tag in targets:
            ddfs.push(push_name, [(StringIO('blobdata'), 'blobdata')])
            self.assert_(ddfs.exists(tag))
        for _, tag in targets:
            ddfs.delete(tag)
            self.assert_(not ddfs.exists(tag))

    def test_tag(self):
        """tag() creates a tag and appends urls to an existing one."""
        ddfs = self.ddfs
        bare = 'disco:test:tag'
        prefixed = 'tag://disco:test:tag'

        ddfs.tag(bare, [['urls']])
        self.assert_(ddfs.exists(bare))
        ddfs.delete(bare)
        self.assert_(not ddfs.exists(bare))

        ddfs.tag(prefixed, [['urls']])
        self.assert_(ddfs.exists(prefixed))
        ddfs.tag(bare, [['more_urls']])
        # Order of the stored urls is not asserted, hence the sort.
        stored = sorted(ddfs.get(bare)['urls'])
        wanted = sorted([['urls'], ['more_urls']])
        self.assertEquals(stored, wanted)

        ddfs.delete(prefixed)
        self.assert_(not ddfs.exists(prefixed))

    def test_put(self):
        """put() replaces the urls of a tag instead of appending to them."""
        ddfs = self.ddfs
        tag = 'disco:test:tag'

        ddfs.put(tag, [['urls']])
        self.assert_(ddfs.exists(tag))
        first_urls = ddfs.get(tag)['urls']
        self.assertEquals(first_urls, [['urls']])

        ddfs.put(tag, [['tags']])
        second_urls = ddfs.get(tag)['urls']
        self.assertEquals(second_urls, [['tags']])

        ddfs.delete('tag://disco:test:tag')

    def test_delete(self):
        """Deleting a tag that does not exist must not raise."""
        missing_tag = 'disco:test:notag'
        self.ddfs.delete(missing_tag)
예제 #4
0
파일: docset.py 프로젝트: sqs/freequery
class Docset(object):
    """
    A `Docset` represents a set of documents, contained in dump files stored on
    DDFS. Class instantiation alone doesn't do anything to DDFS; the DDFS tag
    for this docset won't exist until a dump is added.

    The mapping from document URI to `(dump_name, start_pos, size)` is kept in
    a pickled index. `save` pushes that index to a separate DDFS tag
    (`<docset_name>:index`) as a new blob named after a zero-padded version
    number; `index` loads the blob with the highest version.
    """

    # Width to which index version numbers are zero-padded in blob names.
    INDEX_VERSION_PAD = 4

    def __init__(self, docset_name):
        self.ddfs_tag = docset_name
        self.ddfs_index_tag = docset_name + ':index'
        self.ddfs_link_file_tag = docset_name + ':links'
        self.ddfs = DDFS()
        # Lazily loaded by the `index` property; None means "not loaded yet".
        self.__index = None
        self.__index_version = 0
        # True when the in-memory index has changes that `save` has not pushed.
        self.dirty = False

    def exists(self):
        """Returns True if this Docset exists in DDFS."""
        return self.ddfs.exists(self.ddfs_tag)

    def delete(self):
        """
        Deletes this tag from DDFS. DDFS garbage collection will soon take care
        of dumps in this docset with no other tags. If other docsets link to
        this docset's dumps, then those dumps will remain.
        """
        self.ddfs.delete(self.ddfs_index_tag)
        self.ddfs.delete(self.ddfs_tag)
        # The cached index describes data that was just deleted; drop it so
        # the next access reloads from DDFS instead of returning stale entries.
        self.__index = None
        self.__index_version = 0
        self.dirty = False

    def __latest_index_blob(self, blob_uris):
        """
        Returns `(version_string, uri)` for the index blob in `blob_uris`
        with the highest version. Versions are compared as integers, so the
        result stays correct once a version outgrows INDEX_VERSION_PAD
        digits (as strings, "10000" would sort below "9999").
        """
        versioned = [(self.__blob_uri_to_dump_name(uri), uri)
                     for uri in blob_uris]
        return max(versioned, key=lambda pair: (int(pair[0]), pair[1]))

    @property
    def index(self):
        """
        The dict mapping document URI to `(dump_name, start_pos, size)`,
        loaded from DDFS on first access and cached afterwards. An index tag
        without blobs yields an empty index at version 0.

        Raises EOFError if the stored index blob is truncated.
        """
        if self.__index is None:
            # Each entry yielded by blobs() is a sequence of URLs (presumably
            # one per replica -- confirm against DDFS); take the first rather
            # than requiring exactly one.
            blobs = [urls[0] for urls in self.ddfs.blobs(self.ddfs_index_tag)]
            if not blobs:
                self.__index = {}
                self.__index_version = 0
            else:
                ver, discouri = self.__latest_index_blob(blobs)
                version = int(ver)
                uri = urlresolve(discouri)
                res = urllib2.urlopen(uri)
                try:
                    data = res.read()
                finally:
                    res.close()
                try:
                    # NOTE(review): pickle.loads on data fetched over HTTP is
                    # only safe if the DDFS cluster and network are trusted.
                    loaded = pickle.loads(data)
                except EOFError:
                    raise EOFError("EOF reading docset index at %s in tag %s" % \
                                       (uri, self.ddfs_index_tag))
                # Assign both together so a failed load never leaves the
                # index set without a matching version.
                self.__index = loaded
                self.__index_version = version
        return self.__index

    def save(self):
        """
        Pushes the current index to DDFS as a new blob whose name is the next
        version number, zero-padded to INDEX_VERSION_PAD digits, and clears
        the `dirty` flag.
        """
        import tempfile
        self.index # load if hasn't been loaded yet
        next_version = self.__index_version + 1
        ver = "%0*d" % (self.INDEX_VERSION_PAD, next_version)
        # An anonymous temporary file avoids a predictable path under /tmp
        # and is removed automatically when closed.
        with tempfile.TemporaryFile(mode='w+b') as f:
            pickle.dump(self.__index, f)
            f.flush()
            f.seek(0)
            self.ddfs.push(self.ddfs_index_tag, [(f, ver)])
        # Only advance the version once the push has succeeded.
        self.__index_version = next_version
        self.dirty = False

    def add_dump(self, dumpname, dump):
        """
        Adds a dump to this docset and indexes its documents by position,
        uploading the dump to DDFS with the tag for this docset.

        `dump` is the path of a local dump file and `dumpname` the name it is
        pushed under. The index is only changed in memory; call `save` to
        persist it.
        """
        # index positions
        startpos = 0
        with open(dump, 'rb') as f:
            dociter = WARCParser(f)
            for doc in dociter:
                # The parser position after a document marks where it ends.
                endpos = dociter.tell()
                self.index[doc.uri] = (dumpname, startpos, endpos - startpos)
                startpos = endpos
        self.ddfs.push(self.ddfs_tag, [(dump, dumpname)])
        self.dirty = True

    @property
    def doc_count(self):
        """
        Returns the total number of documents contained in all dumps in this
        docset.
        """
        return len(self.index)

    def doc_uris(self):
        """Returns all URIs of documents contained in all dumps in this
        docset."""
        return self.index.keys()

    def dump_uris(self):
        """
        Returns disco:// URIs for each dump in the docset. Use
        disco.util.urlresolve to convert the disco:// URIs to http:// URIs.
        """
        # First URL of each blob entry; see the note in `index`.
        return (urls[0] for urls in self.ddfs.blobs(self.ddfs_tag))

    def __blob_uri_to_dump_name(self, bloburi):
        """
        Takes a blob URI like
           disco://host/ddfs/vol0/blob/b4/dumpname$4fd-ea750-6d4e1
        and returns "dumpname".
        """
        return re.search(r'/([\w0-9_\-@:]+)\$', bloburi).group(1)

    def __dump_name_to_blob_uri(self, dumpname):
        """
        Takes a dump name like "dumpname" and returns the blob URI like
           disco://host/ddfs/vol0/blob/b4/dumpname$000-11111-fffff

        Raises KeyError(dumpname) if no dump in the docset has that name.
        """
        for uri in self.dump_uris():
            if dumpname == self.__blob_uri_to_dump_name(uri):
                return uri
        raise KeyError(dumpname)

    def dump_names(self):
        """Returns the names of dumps in the docset."""
        return [self.__blob_uri_to_dump_name(uri) for uri in self.dump_uris()]

    def get_pos(self, uri):
        """Returns a tuple `(dump_name, start_pos, size)` giving the dump that
        contains the document `uri`, the byte offset at which it starts and
        its length in bytes. Raises `DocumentNotFound` if `uri` is not in the
        index."""
        try:
            return self.index[uri]
        except KeyError:
            raise DocumentNotFound()

    def get(self, uri):
        """Returns the `Document` with the specified `uri`.

        Only the document's byte range is requested from the dump, using an
        HTTP Range header. Raises `DocumentNotFound` if `uri` is not indexed
        or its dump is no longer part of the docset.
        """
        name, startpos, size = self.get_pos(uri)
        try:
            dump_uri = urlresolve(self.__dump_name_to_blob_uri(name))
        except KeyError:
            raise DocumentNotFound("couldn't find doc with dump name '%s'" % name)

        req = urllib2.Request(dump_uri)
        # Range end offsets are inclusive, hence the -1.
        req.add_header("Range", "bytes=%d-%d" % (startpos, startpos + size - 1))
        res = urllib2.urlopen(req)
        return WARCParser(res).next()