Example #1
    def copy_rec_files(self, user, collection, recording, warc_files):
        if self.dry_run:
            target_dirname = os.path.join('/tmp/migrate4.0', collection.my_id)
        else:
            target_dirname = user.get_user_temp_warc_path()

        os.makedirs(target_dirname, exist_ok=True)
        print('Writing to dir: ' + target_dirname)

        coll_warc_key = recording.COLL_WARC_KEY.format(coll=collection.my_id)
        rec_warc_key = recording.REC_WARC_KEY.format(rec=recording.my_id)

        # Copy WARCs
        loader = BlockLoader()
        total_size = 0

        for n, url in warc_files.items():
            if not url.startswith('s3://'):
                print('FILE ERR: Skipping local file: ' + url)
                continue

            local_filename = n if n != recording.INDEX_FILE_KEY else os.path.basename(url)
            target_file = os.path.join(target_dirname, local_filename)

            src = loader.load(url)

            try:
                with open(target_file, 'wb') as dest:
                    print('Copying {0} -> {1}'.format(url, target_file))
                    shutil.copyfileobj(src, dest)
                    size = dest.tell()

                target_file = add_local_store_prefix(target_file)
                if n != recording.INDEX_FILE_KEY:
                    self.redis.hset(coll_warc_key, n, target_file)
                    self.redis.sadd(rec_warc_key, n)
                    total_size += size
                else:
                    recording.set_prop(n, target_file, update_ts=False)

                if self.dry_run:
                    os.remove(strip_prefix(target_file))

            except:
                import traceback
                traceback.print_exc()

        # commit from temp dir to storage
        if not self.dry_run:
            recording.commit_to_storage()

        return total_size
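
For context: BlockLoader (from pywb.utils.loaders in this era of the codebase) resolves file://, http(s)://, and s3:// URLs into a file-like reader, optionally restricted to a byte range. A minimal sketch of a ranged load; the bucket, path, and offsets here are hypothetical:

from pywb.utils.loaders import BlockLoader

loader = BlockLoader()
# read only the 4096 bytes starting at offset 1024 of the remote object
reader = loader.load('s3://my-bucket/warcs/example.warc.gz', 1024, 4096)
data = reader.read()
reader.close()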
Example #2
File: handlers.py  Project: tilgovi/pywb
class StaticHandler(BaseHandler):
    def __init__(self, static_path):
        mimetypes.init()

        self.static_path = static_path
        self.block_loader = BlockLoader()

    def __call__(self, wbrequest):
        url = wbrequest.wb_url_str.split('?')[0]
        full_path = self.static_path + url

        try:
            data = self.block_loader.load(full_path)

            try:
                data.seek(0, 2)
                size = data.tell()
                data.seek(0)
                headers = [('Content-Length', str(size))]
            except IOError:
                headers = None

            if 'wsgi.file_wrapper' in wbrequest.env:
                reader = wbrequest.env['wsgi.file_wrapper'](data)
            else:
                reader = iter(lambda: data.read(), '')

            content_type = 'application/octet-stream'

            guessed = mimetypes.guess_type(full_path)
            if guessed[0]:
                content_type = guessed[0]

            return WbResponse.text_stream(data,
                                          content_type=content_type,
                                          headers=headers)

        except IOError:
            raise NotFoundException('Static File Not Found: ' +
                                    wbrequest.wb_url_str)

    def __str__(self):  # pragma: no cover
        return 'Static files from ' + self.static_path
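
A note on the fallback branch above: iter(callable, sentinel) keeps calling data.read() until the sentinel comes back, which matches Python 2 text streams returning ''. A standalone sketch of the same pattern for a binary stream, where the sentinel must be b'' and a chunk size keeps each read bounded:

import io

data = io.BytesIO(b'hello world')
# call data.read(8192) repeatedly until it returns b''
reader = iter(lambda: data.read(8192), b'')
for chunk in reader:
    print(len(chunk))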
Example #3
    def wasapi_download(self, username, coll_name, filename):
        user = self._get_wasapi_user(username)

        if not user:
            self._raise_error(404, 'no_such_user')

        collection = user.get_collection_by_name(coll_name)

        if not collection:
            self._raise_error(404, 'no_such_collection')

        # self.access.assert_is_curr_user(user)
        # only users with write access can use wasapi
        self.access.assert_can_write_coll(collection)

        warc_key = collection.get_warc_key()
        warc_path = self.redis.hget(warc_key, filename)

        if not warc_path:
            self._raise_error(404, 'file_not_found')

        response.headers['Content-Type'] = 'application/octet-stream'
        content_disposition = "attachment; filename*=UTF-8''" + filename
        response.headers['Content-Disposition'] = content_disposition
        response.headers['Transfer-Encoding'] = 'chunked'

        loader = BlockLoader()
        fh = None
        try:
            fh = loader.load(warc_path)
        except Exception:
            self._raise_error(400, 'file_load_error')

        def read_all(fh):
            for chunk in StreamIter(fh):
                yield chunk

        return read_all(fh)
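
StreamIter here yields fixed-size chunks until the reader is exhausted, which is what keeps the chunked response memory-bounded. A rough standalone equivalent; the chunk size is an assumption:

def stream_iter(fh, chunk_size=8192):
    # yield successive chunks, closing the reader when done
    try:
        while True:
            chunk = fh.read(chunk_size)
            if not chunk:
                break
            yield chunk
    finally:
        fh.close()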
Example #4
File: zipnum.py  Project: whitten/pywb
class ZipNumIndexSource(BaseIndexSource):
    DEFAULT_RELOAD_INTERVAL = 10  # in minutes
    DEFAULT_MAX_BLOCKS = 10
    IDX_EXT = ('.idx', '.summary')

    def __init__(self, summary, config=None):
        self.max_blocks = self.DEFAULT_MAX_BLOCKS

        self.loc_resolver = None
        self.config = config or {}

        loc = None
        cookie_maker = None
        reload_ival = self.DEFAULT_RELOAD_INTERVAL

        if config:
            loc = config.get('shard_index_loc')
            cookie_maker = config.get('cookie_maker')

            self.max_blocks = config.get('max_blocks', self.max_blocks)

            reload_ival = config.get('reload_interval', reload_ival)

        if isinstance(loc, dict):
            self.loc_resolver = LocPrefixResolver(summary, loc)
        else:
            self.loc_resolver = LocMapResolver(summary, loc)

        self.summary = summary

        # reload interval
        self.loc_update_time = datetime.datetime.now()
        self.reload_interval = datetime.timedelta(minutes=reload_ival)

        self.blk_loader = BlockLoader(cookie_maker=cookie_maker)

    def load_index(self, params):
        self.loc_resolver.load_loc()
        return self._do_load_cdx(self.summary, CDXQuery(params))

    def _do_load_cdx(self, filename, query):
        reader = open(filename, 'rb')

        idx_iter = self.compute_page_range(reader, query)

        if query.secondary_index_only:

            def gen_idx():
                for idx in idx_iter:
                    yield IDXObject(idx)

            return gen_idx()

        if query.page_count:
            return idx_iter

        blocks = self.idx_to_cdx(idx_iter, query)

        def gen_cdx():
            for blk in blocks:
                for cdx in blk:
                    yield CDXObject(cdx)

        return gen_cdx()

    def _page_info(self, pages, pagesize, blocks):
        info = AlwaysJsonResponse(pages=pages,
                                  pageSize=pagesize,
                                  blocks=blocks)

        return info

    def compute_page_range(self, reader, query):
        pagesize = query.page_size
        if not pagesize:
            pagesize = self.max_blocks
        else:
            pagesize = int(pagesize)

        last_line = None

        # Get End
        end_iter = search(reader, query.end_key, prev_size=1)

        try:
            end_line = six.next(end_iter)
        except StopIteration:
            last_line = read_last_line(reader)
            end_line = last_line

        # Get Start
        first_iter = iter_range(reader, query.key, query.end_key, prev_size=1)

        try:
            first_line = six.next(first_iter)
        except StopIteration:
            if end_line == last_line and query.key >= last_line:
                first_line = last_line
            else:
                reader.close()
                if query.page_count:
                    yield self._page_info(0, pagesize, 0)
                return

        first = IDXObject(first_line)

        end = IDXObject(end_line)

        try:
            blocks = end['lineno'] - first['lineno']
            total_pages = int(blocks / pagesize) + 1
        except:
            blocks = -1
            total_pages = 1

        if query.page_count:
            # same line, so actually need to look at cdx
            # to determine if it exists
            if blocks == 0:
                try:
                    block_cdx_iter = self.idx_to_cdx([first_line], query)
                    block = six.next(block_cdx_iter)
                    cdx = six.next(block)
                except StopIteration:
                    total_pages = 0
                    blocks = -1

            yield self._page_info(total_pages, pagesize, blocks + 1)
            reader.close()
            return

        curr_page = query.page
        if curr_page >= total_pages or curr_page < 0:
            msg = 'Page {0} invalid: First Page is 0, Last Page is {1}'
            reader.close()
            raise CDXException(msg.format(curr_page, total_pages - 1))

        startline = curr_page * pagesize
        endline = startline + pagesize - 1
        if blocks >= 0:
            endline = min(endline, blocks)

        if curr_page == 0:
            yield first_line
        else:
            startline -= 1

        idxiter = itertools.islice(first_iter, startline, endline)
        for idx in idxiter:
            yield idx

        reader.close()

    def search_by_line_num(self, reader, line):  # pragma: no cover
        def line_cmp(line1, line2):
            line1_no = int(line1.rsplit(b'\t', 1)[-1])
            line2_no = int(line2.rsplit(b'\t', 1)[-1])
            return cmp(line1_no, line2_no)

        line_iter = search(reader, line, compare_func=line_cmp)
        yield six.next(line_iter)

    def idx_to_cdx(self, idx_iter, query):
        blocks = None
        ranges = []

        for idx in idx_iter:
            idx = IDXObject(idx)

            if (blocks and blocks.part == idx['part']
                    and blocks.offset + blocks.length == idx['offset']
                    and blocks.count < self.max_blocks):

                blocks.length += idx['length']
                blocks.count += 1
                ranges.append(idx['length'])

            else:
                if blocks:
                    yield self.block_to_cdx_iter(blocks, ranges, query)

                blocks = ZipBlocks(idx['part'], idx['offset'], idx['length'],
                                   1)

                ranges = [blocks.length]

        if blocks:
            yield self.block_to_cdx_iter(blocks, ranges, query)

    def block_to_cdx_iter(self, blocks, ranges, query):
        last_exc = None
        last_traceback = None

        try:
            locations = self.loc_resolver(blocks.part, query)
        except:
            raise Exception('No Locations Found for: ' + blocks.part)

        for location in locations:
            try:
                return self.load_blocks(location, blocks, ranges, query)
            except Exception as exc:
                last_exc = exc
                import sys
                last_traceback = sys.exc_info()[2]

        if last_exc:
            six.reraise(Exception, last_exc, last_traceback)
            #raise last_exc
        else:
            raise Exception('No Locations Found for: ' + blocks.part)

    def load_blocks(self, location, blocks, ranges, query):
        """ Load one or more blocks of compressed cdx lines, return
        a line iterator which decompresses and returns one line at a time,
        bounded by query.key and query.end_key
        """
        if (logging.getLogger().getEffectiveLevel() <= logging.DEBUG):
            msg = 'Loading {b.count} blocks from {loc}:{b.offset}+{b.length}'
            logging.debug(msg.format(b=blocks, loc=location))

        reader = self.blk_loader.load(location, blocks.offset, blocks.length)

        def decompress_block(range_):
            decomp = gzip_decompressor()
            buff = decomp.decompress(reader.read(range_))
            for line in BytesIO(buff):
                yield line

        def iter_blocks(reader):
            try:
                for r in ranges:
                    yield decompress_block(r)
            finally:
                reader.close()

        # iterate over all blocks
        iter_ = itertools.chain.from_iterable(iter_blocks(reader))

        # start bound
        iter_ = linearsearch(iter_, query.key)

        # end bound
        iter_ = itertools.takewhile(lambda line: line < query.end_key, iter_)
        return iter_

    def __repr__(self):
        return 'ZipNumIndexSource({0}, {1})'.format(self.summary, self.config)

    def __str__(self):
        return 'zipnum'

    def __eq__(self, other):
        if not isinstance(other, self.__class__):
            return False

        return (self.summary == other.summary and self.config == other.config)

    @classmethod
    def init_from_string(cls, value):
        is_zipnum = False
        if value.startswith('zipnum+'):
            value = value[7:]
            is_zipnum = True

        if value.startswith('file://'):
            value = value[7:]

        if is_zipnum or value.endswith(cls.IDX_EXT):
            return cls(value, None)

    @classmethod
    def init_from_config(cls, config):
        if config['type'] != 'zipnum':
            return

        return cls(config['path'], config)
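
For orientation, each line of the zipnum .idx/.summary file describes one compressed CDX block, and IDXObject parses it into the fields used by compute_page_range and idx_to_cdx. An illustrative line (tab-separated; all values made up):

# com,example)/ 20140101000000<TAB>part-00000.gz<TAB>2048<TAB>1024<TAB>5
idx_fields = {
    'part': 'part-00000.gz',  # shard file containing the gzip member
    'offset': 2048,           # byte offset of the member within the shard
    'length': 1024,           # compressed length of the member
    'lineno': 5,              # line number within the whole cluster
}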
Example #5
class ZipNumCluster(CDXSource):
    DEFAULT_RELOAD_INTERVAL = 10  # in minutes
    DEFAULT_MAX_BLOCKS = 50

    def __init__(self, summary, config=None):

        loc = None
        cookie_maker = None
        self.max_blocks = self.DEFAULT_MAX_BLOCKS
        reload_ival = self.DEFAULT_RELOAD_INTERVAL

        if config:
            loc = config.get('zipnum_loc')
            cookie_maker = config.get('cookie_maker')

            self.max_blocks = config.get('max_blocks', self.max_blocks)

            reload_ival = config.get('reload_interval', reload_ival)

        if not loc:
            splits = os.path.splitext(summary)
            loc = splits[0] + '.loc'

        self.summary = summary
        self.loc_filename = loc

        # initial loc map
        self.loc_map = {}
        self.loc_mtime = 0
        self.load_loc()

        # reload interval
        self.loc_update_time = datetime.datetime.now()
        self.reload_interval = datetime.timedelta(minutes=reload_ival)

        self.blk_loader = BlockLoader(cookie_maker=cookie_maker)

    def load_loc(self):
        # check modified time of current file before loading
        new_mtime = os.path.getmtime(self.loc_filename)
        if (new_mtime == self.loc_mtime):
            return

        # update loc file mtime
        self.loc_mtime = new_mtime

        logging.debug('Loading loc from: ' + self.loc_filename)
        with open(self.loc_filename) as fh:
            for line in fh:
                parts = line.rstrip().split('\t')
                self.loc_map[parts[0]] = parts[1:]

#    @staticmethod
#    def reload_timed(timestamp, val, delta, func):
#        now = datetime.datetime.now()
#        if now - timestamp >= delta:
#            func()
#            return now
#        return None
#
#    def reload_loc(self):
#        reload_time = self.reload_timed(self.loc_update_time,
#                                        self.loc_map,
#                                        self.reload_interval,
#                                        self.load_loc)
#
#        if reload_time:
#            self.loc_update_time = reload_time

    def lookup_loc(self, part):
        return self.loc_map[part]

    def load_cdx(self, query):
        self.load_loc()

        reader = open(self.summary)

        idx_iter = iter_range(reader, query.key, query.end_key, prev_size=1)

        if query.secondary_index_only:
            return idx_iter
        else:
            blocks = self.idx_to_cdx(idx_iter, query)

            def gen_cdx():
                for blk in blocks:
                    for cdx in blk:
                        yield cdx

            return gen_cdx()

    def idx_to_cdx(self, idx_iter, query):
        blocks = None
        ranges = []

        for idx in idx_iter:
            idx = IDXObject(idx)

            if (blocks and blocks.part == idx['part']
                    and blocks.offset + blocks.length == idx['offset']
                    and blocks.count < self.max_blocks):

                blocks.length += idx['length']
                blocks.count += 1
                ranges.append(idx['length'])

            else:
                if blocks:
                    yield self.block_to_cdx_iter(blocks, ranges, query)

                blocks = ZipBlocks(idx['part'], idx['offset'], idx['length'],
                                   1)

                ranges = [blocks.length]

        if blocks:
            yield self.block_to_cdx_iter(blocks, ranges, query)

    def block_to_cdx_iter(self, blocks, ranges, query):
        last_exc = None
        last_traceback = None

        for location in self.lookup_loc(blocks.part):
            try:
                return self.load_blocks(location, blocks, ranges, query)
            except Exception as exc:
                last_exc = exc
                import sys
                last_traceback = sys.exc_info()[2]

        if last_exc:
            raise last_exc, None, last_traceback
        else:
            raise Exception('No Locations Found for: ' + blocks.part)

    def load_blocks(self, location, blocks, ranges, query):

        if (logging.getLogger().getEffectiveLevel() <= logging.DEBUG):
            msg = 'Loading {b.count} blocks from {loc}:{b.offset}+{b.length}'
            logging.debug(msg.format(b=blocks, loc=location))

        reader = self.blk_loader.load(location, blocks.offset, blocks.length)

        def decompress_block(range_):
            decomp = gzip_decompressor()
            buff = decomp.decompress(reader.read(range_))
            return readline_to_iter(BytesIO(buff))

        iter_ = itertools.chain(*itertools.imap(decompress_block, ranges))

        # start bound
        iter_ = linearsearch(iter_, query.key)

        # end bound
        end = query.end_key
        iter_ = itertools.takewhile(lambda line: line < end, iter_)
        return iter_

    def __str__(self):
        return 'ZipNum Cluster: {0}, {1}'.format(self.summary,
                                                 self.loc_filename)
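
The .loc sidecar read by load_loc() maps each shard name to one or more locations, and block_to_cdx_iter tries them in order until one loads. An illustrative line (tab-separated; the paths are made up) and the map it produces:

# part-00000.gz<TAB>/data/cdx/part-00000.gz<TAB>http://mirror.example/part-00000.gz
loc_map = {
    'part-00000.gz': ['/data/cdx/part-00000.gz',
                      'http://mirror.example/part-00000.gz'],
}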
Example #6
    def copy_data_from_recording(self, source, delete_source=False):
        """Copy given recording building block entries.

        :param RedisUniqueComponent source: building block
        :param bool delete_source: whether to delete source building block

        :returns: whether successful or not
        :rtype: bool
        """
        if self == source:
            return False

        if not self.is_open():
            return False

        errored = False

        self._copy_prop(source, 'title')
        self._copy_prop(source, 'desc')
        self._copy_prop(source, 'rec_type')
        self._copy_prop(source, 'recorded_at')
        #self._copy_prop(source, 'patch_rec')

        collection = self.get_owner()
        user = collection.get_owner()

        target_dirname = user.get_user_temp_warc_path()
        target_warc_key = self.COLL_WARC_KEY.format(coll=collection.my_id)

        # Copy WARCs
        loader = BlockLoader()

        for n, url in source.iter_all_files(include_index=True):
            local_filename = n + '.' + timestamp20_now()
            target_file = os.path.join(target_dirname, local_filename)

            src = loader.load(url)

            try:
                with open(target_file, 'wb') as dest:
                    logger.debug('Copying {0} -> {1}'.format(url, target_file))
                    shutil.copyfileobj(src, dest)
                    size = dest.tell()

                if n != self.INDEX_FILE_KEY:
                    self.incr_size(size)
                    self.redis.hset(target_warc_key, n, add_local_store_prefix(target_file))
                else:
                    self.set_prop(n, target_file)

            except:
                traceback.print_exc()
                errored = True

        # COPY cdxj, if exists
        source_key = self.CDXJ_KEY.format(rec=source.my_id)
        target_key = self.CDXJ_KEY.format(rec=self.my_id)

        self.redis.zunionstore(target_key, [source_key])

        # recreate pages, if any, in new recording
        source_coll = source.get_owner()
        source_pages = source_coll.list_rec_pages(source)
        collection.import_pages(source_pages, self)

        # COPY remote archives, if any
        self.redis.sunionstore(self.RA_KEY.format(rec=self.my_id),
                               self.RA_KEY.format(rec=source.my_id))

        # COPY recording warc keys
        self.redis.sunionstore(self.REC_WARC_KEY.format(rec=self.my_id),
                               self.REC_WARC_KEY.format(rec=source.my_id))

        # sync collection cdxj, if exists
        collection.sync_coll_index(exists=True, do_async=True)

        if not errored and delete_source:
            collection = source.get_owner()
            collection.remove_recording(source, delete=True)

        return not errored
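
The zunionstore/sunionstore calls above rely on a Redis idiom: a union over a single source key copies that key into the destination. A minimal sketch with redis-py; the key names are hypothetical:

import redis

r = redis.StrictRedis()
# copy a sorted set of cdxj lines
r.zunionstore('rec:dest:cdxj', ['rec:src:cdxj'])
# copy a plain set of WARC keys
r.sunionstore('rec:dest:warcs', ['rec:src:warcs'])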
Example #7
File: zipnum.py  Project: Orbiter/pywb
class ZipNumCluster(CDXSource):
    DEFAULT_RELOAD_INTERVAL = 10  # in minutes
    DEFAULT_MAX_BLOCKS = 10

    def __init__(self, summary, config=None):
        self.max_blocks = self.DEFAULT_MAX_BLOCKS

        self.loc_resolver = None

        loc = None
        cookie_maker = None
        reload_ival = self.DEFAULT_RELOAD_INTERVAL

        if config:
            loc = config.get('shard_index_loc')
            cookie_maker = config.get('cookie_maker')

            self.max_blocks = config.get('max_blocks', self.max_blocks)

            reload_ival = config.get('reload_interval', reload_ival)


        if isinstance(loc, dict):
            self.loc_resolver = LocPrefixResolver(summary, loc)
        else:
            self.loc_resolver = LocMapResolver(summary, loc)

        self.summary = summary

        # reload interval
        self.loc_update_time = datetime.datetime.now()
        self.reload_interval = datetime.timedelta(minutes=reload_ival)

        self.blk_loader = BlockLoader(cookie_maker=cookie_maker)

#    @staticmethod
#    def reload_timed(timestamp, val, delta, func):
#        now = datetime.datetime.now()
#        if now - timestamp >= delta:
#            func()
#            return now
#        return None
#
#    def reload_loc(self):
#        reload_time = self.reload_timed(self.loc_update_time,
#                                        self.loc_map,
#                                        self.reload_interval,
#                                        self.load_loc)
#
#        if reload_time:
#            self.loc_update_time = reload_time

    def load_cdx(self, query):
        self.loc_resolver.load_loc()
        return self._do_load_cdx(self.summary, query)

    def _do_load_cdx(self, filename, query):
        reader = open(filename, 'rb')

        idx_iter = self.compute_page_range(reader, query)

        if query.secondary_index_only or query.page_count:
            return idx_iter

        blocks = self.idx_to_cdx(idx_iter, query)

        def gen_cdx():
            for blk in blocks:
                for cdx in blk:
                    yield cdx

        return gen_cdx()


    def _page_info(self, pages, pagesize, blocks):
        info = dict(pages=pages,
                    pageSize=pagesize,
                    blocks=blocks)
        return json.dumps(info) + '\n'

    def compute_page_range(self, reader, query):
        pagesize = query.page_size
        if not pagesize:
            pagesize = self.max_blocks
        else:
            pagesize = int(pagesize)

        last_line = None

        # Get End
        end_iter = search(reader, query.end_key, prev_size=1)

        try:
            end_line = end_iter.next()
        except StopIteration:
            last_line = read_last_line(reader)
            end_line = last_line

        # Get Start
        first_iter = iter_range(reader,
                                query.key,
                                query.end_key,
                                prev_size=1)

        try:
            first_line = first_iter.next()
        except StopIteration:
            if end_line == last_line and query.key >= last_line:
                first_line = last_line
            else:
                reader.close()
                if query.page_count:
                    yield self._page_info(0, pagesize, 0)
                    return
                else:
                    raise

        first = IDXObject(first_line)

        end = IDXObject(end_line)

        try:
            blocks = end['lineno'] - first['lineno']
            total_pages = blocks / pagesize + 1
        except:
            blocks = -1
            total_pages = 1

        if query.page_count:
            # same line, so actually need to look at cdx
            # to determine if it exists
            if blocks == 0:
                try:
                    block_cdx_iter = self.idx_to_cdx([first_line], query)
                    block = block_cdx_iter.next()
                    cdx = block.next()
                except StopIteration:
                    total_pages = 0
                    blocks = -1

            yield self._page_info(total_pages, pagesize, blocks + 1)
            reader.close()
            return

        curr_page = query.page
        if curr_page >= total_pages or curr_page < 0:
            msg = 'Page {0} invalid: First Page is 0, Last Page is {1}'
            reader.close()
            raise CDXException(msg.format(curr_page, total_pages - 1))

        startline = curr_page * pagesize
        endline = startline + pagesize - 1
        if blocks >= 0:
            endline = min(endline, blocks)

        if curr_page == 0:
            yield first_line
        else:
            startline -= 1

        idxiter = itertools.islice(first_iter, startline, endline)
        for idx in idxiter:
            yield idx

        reader.close()


    def search_by_line_num(self, reader, line):  # pragma: no cover
        def line_cmp(line1, line2):
            line1_no = int(line1.rsplit('\t', 1)[-1])
            line2_no = int(line2.rsplit('\t', 1)[-1])
            return cmp(line1_no, line2_no)

        line_iter = search(reader, line, compare_func=line_cmp)
        yield line_iter.next()

    def idx_to_cdx(self, idx_iter, query):
        blocks = None
        ranges = []

        for idx in idx_iter:
            idx = IDXObject(idx)

            if (blocks and blocks.part == idx['part'] and
                blocks.offset + blocks.length == idx['offset'] and
                blocks.count < self.max_blocks):

                    blocks.length += idx['length']
                    blocks.count += 1
                    ranges.append(idx['length'])

            else:
                if blocks:
                    yield self.block_to_cdx_iter(blocks, ranges, query)

                blocks = ZipBlocks(idx['part'],
                                   idx['offset'],
                                   idx['length'],
                                   1)

                ranges = [blocks.length]

        if blocks:
            yield self.block_to_cdx_iter(blocks, ranges, query)

    def block_to_cdx_iter(self, blocks, ranges, query):
        last_exc = None
        last_traceback = None

        try:
            locations = self.loc_resolver(blocks.part, query)
        except:
            raise Exception('No Locations Found for: ' + blocks.part)

        for location in locations:
            try:
                return self.load_blocks(location, blocks, ranges, query)
            except Exception as exc:
                last_exc = exc
                import sys
                last_traceback = sys.exc_info()[2]

        if last_exc:
            raise last_exc, None, last_traceback
        else:
            raise Exception('No Locations Found for: ' + blocks.part)

    def load_blocks(self, location, blocks, ranges, query):
        """ Load one or more blocks of compressed cdx lines, return
        a line iterator which decompresses and returns one line at a time,
        bounded by query.key and query.end_key
        """

        if (logging.getLogger().getEffectiveLevel() <= logging.DEBUG):
            msg = 'Loading {b.count} blocks from {loc}:{b.offset}+{b.length}'
            logging.debug(msg.format(b=blocks, loc=location))

        reader = self.blk_loader.load(location, blocks.offset, blocks.length)

        def decompress_block(range_):
            decomp = gzip_decompressor()
            buff = decomp.decompress(reader.read(range_))
            for line in BytesIO(buff):
                yield line

        iter_ = itertools.chain(*itertools.imap(decompress_block, ranges))

        # start bound
        iter_ = linearsearch(iter_, query.key)

        # end bound
        end = query.end_key
        iter_ = itertools.takewhile(lambda line: line < end, iter_)
        return iter_

    def __str__(self):
        return 'ZipNum Cluster: {0}, {1}'.format(self.summary,
                                                 self.loc_resolver)
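
This variant is Python 2 throughout (.next() on iterators, itertools.imap, the three-argument raise statement). The six.reraise call seen in the other variants is the portable spelling of the same re-raise; a minimal sketch:

import sys

import six

def risky():
    raise ValueError('boom')  # placeholder failure

try:
    risky()
except Exception as exc:
    tb = sys.exc_info()[2]
    # portable equivalent of Python 2's `raise exc, None, tb`
    six.reraise(type(exc), exc, tb)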
Example #8
    def copy_data_from_recording(self, source, delete_source=False):
        """Copy given recording building block entries.

        :param RedisUniqueComponent source: building block
        :param bool delete_source: whether to delete source building block

        :returns: whether successful or not
        :rtype: bool
        """
        if self == source:
            return False

        if not self.is_open():
            return False

        errored = False

        self._copy_prop(source, 'title')
        self._copy_prop(source, 'desc')
        self._copy_prop(source, 'rec_type')
        self._copy_prop(source, 'recorded_at')
        #self._copy_prop(source, 'patch_rec')

        collection = self.get_owner()
        user = collection.get_owner()

        target_dirname = user.get_user_temp_warc_path()
        target_warc_key = self.COLL_WARC_KEY.format(coll=collection.my_id)

        # Copy WARCs
        loader = BlockLoader()

        for n, url in source.iter_all_files(include_index=True):
            local_filename = n + '.' + timestamp20_now()
            target_file = os.path.join(target_dirname, local_filename)

            src = loader.load(url)

            try:
                with open(target_file, 'wb') as dest:
                    print('Copying {0} -> {1}'.format(url, target_file))
                    shutil.copyfileobj(src, dest)
                    size = dest.tell()

                if n != self.INDEX_FILE_KEY:
                    self.incr_size(size)
                    self.redis.hset(target_warc_key, n,
                                    add_local_store_prefix(target_file))
                else:
                    self.set_prop(n, target_file)

            except:
                import traceback
                traceback.print_exc()
                errored = True

        # COPY cdxj, if exists
        source_key = self.CDXJ_KEY.format(rec=source.my_id)
        target_key = self.CDXJ_KEY.format(rec=self.my_id)

        self.redis.zunionstore(target_key, [source_key])

        # recreate pages, if any, in new recording
        source_coll = source.get_owner()
        source_pages = source_coll.list_rec_pages(source)
        collection.import_pages(source_pages, self)

        # COPY remote archives, if any
        self.redis.sunionstore(self.RA_KEY.format(rec=self.my_id),
                               self.RA_KEY.format(rec=source.my_id))

        # COPY recording warc keys
        self.redis.sunionstore(self.REC_WARC_KEY.format(rec=self.my_id),
                               self.REC_WARC_KEY.format(rec=source.my_id))

        # sync collection cdxj, if exists
        collection.sync_coll_index(exists=True, do_async=True)

        if not errored and delete_source:
            collection = source.get_owner()
            collection.remove_recording(source, delete=True)

        return not errored
Example #9
File: zipnum.py  Project: ikreymer/pywb
class ZipNumIndexSource(BaseIndexSource):
    DEFAULT_RELOAD_INTERVAL = 10  # in minutes
    DEFAULT_MAX_BLOCKS = 10
    IDX_EXT = ('.idx', '.summary')

    def __init__(self, summary, config=None):
        self.max_blocks = self.DEFAULT_MAX_BLOCKS

        self.loc_resolver = None
        self.config = config or {}

        loc = None
        cookie_maker = None
        reload_ival = self.DEFAULT_RELOAD_INTERVAL

        if config:
            loc = config.get('shard_index_loc')
            cookie_maker = config.get('cookie_maker')

            self.max_blocks = config.get('max_blocks', self.max_blocks)

            reload_ival = config.get('reload_interval', reload_ival)

        if isinstance(loc, dict):
            self.loc_resolver = LocPrefixResolver(summary, loc)
        else:
            self.loc_resolver = LocMapResolver(summary, loc)

        self.summary = summary

        # reload interval
        self.loc_update_time = datetime.datetime.now()
        self.reload_interval = datetime.timedelta(minutes=reload_ival)

        self.blk_loader = BlockLoader(cookie_maker=cookie_maker)

    def load_index(self, params):
        self.loc_resolver.load_loc()
        return self._do_load_cdx(self.summary, CDXQuery(params))

    def _do_load_cdx(self, filename, query):
        reader = open(filename, 'rb')

        idx_iter = self.compute_page_range(reader, query)

        if query.secondary_index_only:
            def gen_idx():
                for idx in idx_iter:
                    yield IDXObject(idx)

            return gen_idx()

        if query.page_count:
            return idx_iter

        blocks = self.idx_to_cdx(idx_iter, query)

        def gen_cdx():
            for blk in blocks:
                for cdx in blk:
                    yield CDXObject(cdx)

        return gen_cdx()

    def _page_info(self, pages, pagesize, blocks):
        info = AlwaysJsonResponse(
                    pages=pages,
                    pageSize=pagesize,
                    blocks=blocks)

        return info

    def compute_page_range(self, reader, query):
        pagesize = query.page_size
        if not pagesize:
            pagesize = self.max_blocks
        else:
            pagesize = int(pagesize)

        last_line = None

        # Get End
        end_iter = search(reader, query.end_key, prev_size=1)

        try:
            end_line = six.next(end_iter)
        except StopIteration:
            last_line = read_last_line(reader)
            end_line = last_line

        # Get Start
        first_iter = iter_range(reader,
                                query.key,
                                query.end_key,
                                prev_size=1)

        try:
            first_line = six.next(first_iter)
        except StopIteration:
            if end_line == last_line and query.key >= last_line:
                first_line = last_line
            else:
                reader.close()
                if query.page_count:
                    yield self._page_info(0, pagesize, 0)
                return

        first = IDXObject(first_line)

        end = IDXObject(end_line)

        try:
            blocks = end['lineno'] - first['lineno']
            total_pages = int(blocks / pagesize) + 1
        except:
            blocks = -1
            total_pages = 1

        if query.page_count:
            # same line, so actually need to look at cdx
            # to determine if it exists
            if blocks == 0:
                try:
                    block_cdx_iter = self.idx_to_cdx([first_line], query)
                    block = six.next(block_cdx_iter)
                    cdx = six.next(block)
                except StopIteration:
                    total_pages = 0
                    blocks = -1

            yield self._page_info(total_pages, pagesize, blocks + 1)
            reader.close()
            return

        curr_page = query.page
        if curr_page >= total_pages or curr_page < 0:
            msg = 'Page {0} invalid: First Page is 0, Last Page is {1}'
            reader.close()
            raise CDXException(msg.format(curr_page, total_pages - 1))

        startline = curr_page * pagesize
        endline = startline + pagesize - 1
        if blocks >= 0:
            endline = min(endline, blocks)

        if curr_page == 0:
            yield first_line
        else:
            startline -= 1

        idxiter = itertools.islice(first_iter, startline, endline)
        for idx in idxiter:
            yield idx

        reader.close()


    def search_by_line_num(self, reader, line):  # pragma: no cover
        def line_cmp(line1, line2):
            line1_no = int(line1.rsplit(b'\t', 1)[-1])
            line2_no = int(line2.rsplit(b'\t', 1)[-1])
            return cmp(line1_no, line2_no)

        line_iter = search(reader, line, compare_func=line_cmp)
        yield six.next(line_iter)

    def idx_to_cdx(self, idx_iter, query):
        blocks = None
        ranges = []

        for idx in idx_iter:
            idx = IDXObject(idx)

            if (blocks and blocks.part == idx['part'] and
                blocks.offset + blocks.length == idx['offset'] and
                blocks.count < self.max_blocks):

                    blocks.length += idx['length']
                    blocks.count += 1
                    ranges.append(idx['length'])

            else:
                if blocks:
                    yield self.block_to_cdx_iter(blocks, ranges, query)

                blocks = ZipBlocks(idx['part'],
                                   idx['offset'],
                                   idx['length'],
                                   1)

                ranges = [blocks.length]

        if blocks:
            yield self.block_to_cdx_iter(blocks, ranges, query)

    def block_to_cdx_iter(self, blocks, ranges, query):
        last_exc = None
        last_traceback = None

        try:
            locations = self.loc_resolver(blocks.part, query)
        except:
            raise Exception('No Locations Found for: ' + blocks.part)

        for location in locations:
            try:
                return self.load_blocks(location, blocks, ranges, query)
            except Exception as exc:
                last_exc = exc
                import sys
                last_traceback = sys.exc_info()[2]

        if last_exc:
            six.reraise(Exception, last_exc, last_traceback)
            #raise last_exc
        else:
            raise Exception('No Locations Found for: ' + blocks.part)

    def load_blocks(self, location, blocks, ranges, query):
        """ Load one or more blocks of compressed cdx lines, return
        a line iterator which decompresses and returns one line at a time,
        bounded by query.key and query.end_key
        """
        if (logging.getLogger().getEffectiveLevel() <= logging.DEBUG):
            msg = 'Loading {b.count} blocks from {loc}:{b.offset}+{b.length}'
            logging.debug(msg.format(b=blocks, loc=location))

        reader = self.blk_loader.load(location, blocks.offset, blocks.length)

        def decompress_block(range_):
            decomp = gzip_decompressor()
            buff = decomp.decompress(reader.read(range_))
            for line in BytesIO(buff):
                yield line

        def iter_blocks(reader):
            try:
                for r in ranges:
                    yield decompress_block(r)
            finally:
                reader.close()

        # iterate over all blocks
        iter_ = itertools.chain.from_iterable(iter_blocks(reader))

        # start bound
        iter_ = linearsearch(iter_, query.key)

        # end bound
        iter_ = itertools.takewhile(lambda line: line < query.end_key, iter_)
        return iter_

    def __repr__(self):
        return 'ZipNumIndexSource({0}, {1})'.format(self.summary, self.config)

    def __str__(self):
        return 'zipnum'

    def __eq__(self, other):
        if not isinstance(other, self.__class__):
            return False

        return (self.summary == other.summary and
                self.config == other.config)

    @classmethod
    def init_from_string(cls, value):
        is_zipnum = False
        if value.startswith('zipnum+'):
            value = value[7:]
            is_zipnum = True

        if value.startswith('file://'):
            value = value[7:]

        if is_zipnum or value.endswith(cls.IDX_EXT):
            return cls(value, None)

    @classmethod
    def init_from_config(cls, config):
        if config['type'] != 'zipnum':
            return

        return cls(config['path'], config)
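
Each zipnum block is a self-contained gzip member, which is why decompress_block can inflate one range at a time from a single sequential reader. A minimal sketch of what gzip_decompressor likely amounts to, assuming the usual zlib wrapper:

import zlib
from io import BytesIO

def gzip_decompressor():
    # 16 + MAX_WBITS tells zlib to expect a gzip header and trailer
    return zlib.decompressobj(16 + zlib.MAX_WBITS)

def iter_member_lines(compressed_bytes):
    buff = gzip_decompressor().decompress(compressed_bytes)
    for line in BytesIO(buff):
        yield line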
Example #10
class ZipNumCluster(CDXSource):
    DEFAULT_RELOAD_INTERVAL = 10  # in minutes
    DEFAULT_MAX_BLOCKS = 10

    def __init__(self, summary, config=None):
        self.max_blocks = self.DEFAULT_MAX_BLOCKS

        self.loc_resolver = None

        loc = None
        cookie_maker = None
        reload_ival = self.DEFAULT_RELOAD_INTERVAL

        if config:
            loc = config.get('shard_index_loc')
            cookie_maker = config.get('cookie_maker')

            self.max_blocks = config.get('max_blocks', self.max_blocks)

            reload_ival = config.get('reload_interval', reload_ival)

        if isinstance(loc, dict):
            self.loc_resolver = LocPrefixResolver(summary, loc)
        else:
            self.loc_resolver = LocMapResolver(summary, loc)

        self.summary = summary

        # reload interval
        self.loc_update_time = datetime.datetime.now()
        self.reload_interval = datetime.timedelta(minutes=reload_ival)

        self.blk_loader = BlockLoader(cookie_maker=cookie_maker)

#    @staticmethod
#    def reload_timed(timestamp, val, delta, func):
#        now = datetime.datetime.now()
#        if now - timestamp >= delta:
#            func()
#            return now
#        return None
#
#    def reload_loc(self):
#        reload_time = self.reload_timed(self.loc_update_time,
#                                        self.loc_map,
#                                        self.reload_interval,
#                                        self.load_loc)
#
#        if reload_time:
#            self.loc_update_time = reload_time

    def load_cdx(self, query):
        self.loc_resolver.load_loc()

        reader = open(self.summary, 'rb')

        idx_iter = self.compute_page_range(reader, query)

        if query.secondary_index_only or query.page_count:
            return idx_iter

        blocks = self.idx_to_cdx(idx_iter, query)

        def gen_cdx():
            for blk in blocks:
                for cdx in blk:
                    yield cdx

        return gen_cdx()

    def compute_page_range(self, reader, query):

        # Get End
        end_iter = search(reader, query.end_key, prev_size=1)

        try:
            end_line = end_iter.next()
        except StopIteration:
            end_line = read_last_line(reader)

        # Get Start

        first_iter = iter_range(reader, query.key, query.end_key, prev_size=1)

        try:
            first_line = first_iter.next()
        except StopIteration:
            reader.close()
            raise

        first = IDXObject(first_line)

        end = IDXObject(end_line)
        diff = end['lineno'] - first['lineno']

        pagesize = query.page_size
        if not pagesize:
            pagesize = self.max_blocks

        total_pages = diff / pagesize + 1

        if query.page_count:
            info = dict(pages=total_pages, pageSize=pagesize, blocks=diff + 1)
            yield json.dumps(info)
            reader.close()
            return

        curr_page = query.page
        if curr_page >= total_pages or curr_page < 0:
            msg = 'Page {0} invalid: First Page is 0, Last Page is {1}'
            reader.close()
            raise CDXException(msg.format(curr_page, total_pages - 1))

        startline = curr_page * pagesize
        endline = min(startline + pagesize - 1, diff)

        if curr_page == 0:
            yield first_line
        else:
            startline -= 1

        idxiter = itertools.islice(first_iter, startline, endline)
        for idx in idxiter:
            yield idx

        reader.close()

    def search_by_line_num(self, reader, line):  # pragma: no cover
        def line_cmp(line1, line2):
            line1_no = int(line1.rsplit('\t', 1)[-1])
            line2_no = int(line2.rsplit('\t', 1)[-1])
            return cmp(line1_no, line2_no)

        line_iter = search(reader, line, compare_func=line_cmp)
        yield line_iter.next()

    def idx_to_cdx(self, idx_iter, query):
        blocks = None
        ranges = []

        for idx in idx_iter:
            idx = IDXObject(idx)

            if (blocks and blocks.part == idx['part']
                    and blocks.offset + blocks.length == idx['offset']
                    and blocks.count < self.max_blocks):

                blocks.length += idx['length']
                blocks.count += 1
                ranges.append(idx['length'])

            else:
                if blocks:
                    yield self.block_to_cdx_iter(blocks, ranges, query)

                blocks = ZipBlocks(idx['part'], idx['offset'], idx['length'],
                                   1)

                ranges = [blocks.length]

        if blocks:
            yield self.block_to_cdx_iter(blocks, ranges, query)

    def block_to_cdx_iter(self, blocks, ranges, query):
        last_exc = None
        last_traceback = None

        for location in self.loc_resolver(blocks.part):
            try:
                return self.load_blocks(location, blocks, ranges, query)
            except Exception as exc:
                last_exc = exc
                import sys
                last_traceback = sys.exc_info()[2]

        if last_exc:
            raise last_exc, None, last_traceback
        else:
            raise Exception('No Locations Found for: ' + blocks.part)

    def load_blocks(self, location, blocks, ranges, query):
        """ Load one or more blocks of compressed cdx lines, return
        a line iterator which decompresses and returns one line at a time,
        bounded by query.key and query.end_key
        """

        if (logging.getLogger().getEffectiveLevel() <= logging.DEBUG):
            msg = 'Loading {b.count} blocks from {loc}:{b.offset}+{b.length}'
            logging.debug(msg.format(b=blocks, loc=location))

        reader = self.blk_loader.load(location, blocks.offset, blocks.length)

        def decompress_block(range_):
            decomp = gzip_decompressor()
            buff = decomp.decompress(reader.read(range_))
            for line in BytesIO(buff):
                yield line

        iter_ = itertools.chain(*itertools.imap(decompress_block, ranges))

        # start bound
        iter_ = linearsearch(iter_, query.key)

        # end bound
        end = query.end_key
        iter_ = itertools.takewhile(lambda line: line < end, iter_)
        return iter_

    def __str__(self):
        return 'ZipNum Cluster: {0}, {1}'.format(self.summary,
                                                 self.loc_resolver)
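
To make the paging arithmetic in compute_page_range concrete, a worked example with made-up numbers:

blocks = 95       # idx lines between the first and last match
pagesize = 10
total_pages = int(blocks / pagesize) + 1           # 10 pages, numbered 0..9

curr_page = 3
startline = curr_page * pagesize                   # 30
endline = min(startline + pagesize - 1, blocks)    # 39
# for pages > 0 the start shifts back by one, because the first
# matching line was already consumed from first_iter
startline -= 1                                     # 29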
Example #11
class ZipNumCluster(CDXSource):
    DEFAULT_RELOAD_INTERVAL = 10  # in minutes
    DEFAULT_MAX_BLOCKS = 50

    def __init__(self, summary, config=None):

        loc = None
        cookie_maker = None
        self.max_blocks = self.DEFAULT_MAX_BLOCKS
        reload_ival = self.DEFAULT_RELOAD_INTERVAL

        if config:
            loc = config.get('zipnum_loc')
            cookie_maker = config.get('cookie_maker')

            self.max_blocks = config.get('max_blocks', self.max_blocks)

            reload_ival = config.get('reload_interval', reload_ival)

        if not loc:
            splits = os.path.splitext(summary)
            loc = splits[0] + '.loc'

        self.summary = summary
        self.loc_filename = loc

        # initial loc map
        self.loc_map = {}
        self.loc_mtime = 0
        self.load_loc()

        # reload interval
        self.loc_update_time = datetime.datetime.now()
        self.reload_interval = datetime.timedelta(minutes=reload_ival)

        self.blk_loader = BlockLoader(cookie_maker=cookie_maker)

    def load_loc(self):
        # check modified time of current file before loading
        new_mtime = os.path.getmtime(self.loc_filename)
        if (new_mtime == self.loc_mtime):
            return

        # update loc file mtime
        self.loc_mtime = new_mtime

        logging.debug('Loading loc from: ' + self.loc_filename)
        with open(self.loc_filename) as fh:
            for line in fh:
                parts = line.rstrip().split('\t')
                self.loc_map[parts[0]] = parts[1:]

#    @staticmethod
#    def reload_timed(timestamp, val, delta, func):
#        now = datetime.datetime.now()
#        if now - timestamp >= delta:
#            func()
#            return now
#        return None
#
#    def reload_loc(self):
#        reload_time = self.reload_timed(self.loc_update_time,
#                                        self.loc_map,
#                                        self.reload_interval,
#                                        self.load_loc)
#
#        if reload_time:
#            self.loc_update_time = reload_time

    def lookup_loc(self, part):
        return self.loc_map[part]

    def load_cdx(self, query):
        self.load_loc()

        reader = open(self.summary)

        idx_iter = iter_range(reader,
                              query.key,
                              query.end_key,
                              prev_size=1)

        if query.secondary_index_only:
            return idx_iter
        else:
            blocks = self.idx_to_cdx(idx_iter, query)

            def gen_cdx():
                for blk in blocks:
                    for cdx in blk:
                        yield cdx

            return gen_cdx()

    def idx_to_cdx(self, idx_iter, query):
        blocks = None
        ranges = []

        for idx in idx_iter:
            idx = IDXObject(idx)

            if (blocks and blocks.part == idx['part'] and
                blocks.offset + blocks.length == idx['offset'] and
                blocks.count < self.max_blocks):

                    blocks.length += idx['length']
                    blocks.count += 1
                    ranges.append(idx['length'])

            else:
                if blocks:
                    yield self.block_to_cdx_iter(blocks, ranges, query)

                blocks = ZipBlocks(idx['part'],
                                   idx['offset'],
                                   idx['length'],
                                   1)

                ranges = [blocks.length]

        if blocks:
            yield self.block_to_cdx_iter(blocks, ranges, query)

    def block_to_cdx_iter(self, blocks, ranges, query):
        last_exc = None
        last_traceback = None

        for location in self.lookup_loc(blocks.part):
            try:
                return self.load_blocks(location, blocks, ranges, query)
            except Exception as exc:
                last_exc = exc
                import sys
                last_traceback = sys.exc_info()[2]

        if last_exc:
            raise last_exc, None, last_traceback
        else:
            raise Exception('No Locations Found for: ' + blocks.part)

    def load_blocks(self, location, blocks, ranges, query):

        if (logging.getLogger().getEffectiveLevel() <= logging.DEBUG):
            msg = 'Loading {b.count} blocks from {loc}:{b.offset}+{b.length}'
            logging.debug(msg.format(b=blocks, loc=location))

        reader = self.blk_loader.load(location, blocks.offset, blocks.length)

        def decompress_block(range_):
            decomp = gzip_decompressor()
            buff = decomp.decompress(reader.read(range_))
            return readline_to_iter(BytesIO(buff))

        iter_ = itertools.chain(*itertools.imap(decompress_block, ranges))

        # start bound
        iter_ = linearsearch(iter_, query.key)

        # end bound
        end = query.end_key
        iter_ = itertools.takewhile(lambda line: line < end, iter_)
        return iter_

    def __str__(self):
        return 'ZipNum Cluster: {0}, {1}'.format(self.summary,
                                                 self.loc_filename)
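
Finally, the start/end bounding at the bottom of load_blocks works because cdx lines are sorted: linearsearch advances to the first line at or past query.key, and takewhile stops at end_key. A rough sketch of the start bound under that sorted-input assumption (not pywb's exact implementation):

import itertools

def linear_search(lines, key):
    # drop leading lines that sort before the query key
    return itertools.dropwhile(lambda line: line < key, lines)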