def load_archive_info_xml(self, url):
    self.archive_infos = {}
    url = os.path.expandvars(url)
    logging.debug('Loading XML from {0}'.format(url))
    if not url:
        return

    try:
        stream = BlockLoader().load(url)
    except Exception as e:
        logging.debug(e)
        logging.debug('Proceeding without xml archive info')
        return

    root = ElementTree.fromstring(stream.read())

    for link in root.findall('link'):
        name = link.get('id')
        longname = link.get('longname')
        archive = link.find('archive')
        timegate = link.find('timegate')

        if timegate is None or archive is None:
            continue

        rewritten = (archive.get('rewritten-urls') == 'yes')
        unrewritten_url = archive.get('un-rewritten-api-url', '')
        uri = timegate.get('uri')

        self.archive_infos[name] = {
            'uri': uri,
            'rewritten': rewritten,
            'unrewritten_url': unrewritten_url,
            'name': longname
        }
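# A minimal sketch (not from the source) of the XML shape that
# load_archive_info_xml() above expects, inferred from the element and
# attribute names it reads. The root element name, archive id, longname,
# and URIs are all hypothetical placeholders.
SAMPLE_ARCHIVE_INFO_XML = """\
<links>
  <link id="examplearchive" longname="Example Web Archive">
    <archive rewritten-urls="yes"
             un-rewritten-api-url="http://web.archive.example/{timestamp}id_/{url}"/>
    <timegate uri="http://web.archive.example/web/"/>
  </link>
</links>
"""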
def load_archive_info_json(self, url):
    self.archive_infos = {}
    url = os.path.expandvars(url)
    logging.debug('Loading JSON from {0}'.format(url))
    if not url:
        return

    try:
        stream = BlockLoader().load(url)
    except Exception as e:
        logging.debug(e)
        logging.debug('Proceeding without json archive info')
        return

    archives = json.loads(stream.read())
    for arc in archives:
        id_ = arc['id']
        name = arc['name']
        uri = arc['timegate']
        base_url = arc.get('base_url', uri)
        unrewritten_url = arc.get('unrewritten_url')
        if not unrewritten_url:
            unrewritten_url = base_url + '{timestamp}id_/{url}'

        self.archive_infos[id_] = {
            'id': id_,
            'uri': uri,
            'name': name,
            'base_url': base_url,
            'unrewritten_url': unrewritten_url
        }
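# A minimal sketch (not from the source) of the JSON document that
# load_archive_info_json() above expects, inferred from the keys it reads:
# a list of objects with required 'id', 'name', and 'timegate' fields and
# optional 'base_url' and 'unrewritten_url'. All values are hypothetical.
SAMPLE_ARCHIVE_INFO_JSON = """\
[
  {
    "id": "examplearchive",
    "name": "Example Web Archive",
    "timegate": "http://web.archive.example/web/",
    "base_url": "http://web.archive.example/web/"
  }
]
"""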
def __init__(self, summary, config=None):
    loc = None
    cookie_maker = None
    self.max_blocks = self.DEFAULT_MAX_BLOCKS
    reload_ival = self.DEFAULT_RELOAD_INTERVAL

    if config:
        loc = config.get('zipnum_loc')
        cookie_maker = config.get('cookie_maker')
        self.max_blocks = config.get('max_blocks', self.max_blocks)
        reload_ival = config.get('reload_interval', reload_ival)

    if not loc:
        splits = os.path.splitext(summary)
        loc = splits[0] + '.loc'

    self.summary = summary
    self.loc_filename = loc

    # initial loc map
    self.loc_map = {}
    self.loc_mtime = 0
    self.load_loc()

    # reload interval
    self.loc_update_time = datetime.datetime.now()
    self.reload_interval = datetime.timedelta(minutes=reload_ival)

    self.blk_loader = BlockLoader(cookie_maker=cookie_maker)
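# Hedged illustration of the default '.loc' sidecar derivation above: when
# config supplies no 'zipnum_loc', the loc filename is the summary path with
# its extension swapped for '.loc'. The path below is hypothetical.
assert os.path.splitext('indexes/cluster.summary')[0] + '.loc' == 'indexes/cluster.loc'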
def __init__(self, summary, config=None):
    self.max_blocks = self.DEFAULT_MAX_BLOCKS
    self.loc_resolver = None
    self.config = config or {}

    loc = None
    cookie_maker = None
    reload_ival = self.DEFAULT_RELOAD_INTERVAL

    if config:
        loc = config.get('shard_index_loc')
        cookie_maker = config.get('cookie_maker')
        self.max_blocks = config.get('max_blocks', self.max_blocks)
        reload_ival = config.get('reload_interval', reload_ival)

    if isinstance(loc, dict):
        self.loc_resolver = LocPrefixResolver(summary, loc)
    else:
        self.loc_resolver = LocMapResolver(summary, loc)

    self.summary = summary

    # reload interval
    self.loc_update_time = datetime.datetime.now()
    self.reload_interval = datetime.timedelta(minutes=reload_ival)

    self.blk_loader = BlockLoader(cookie_maker=cookie_maker)
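# A hedged config sketch for the constructor above: a dict-valued
# 'shard_index_loc' selects LocPrefixResolver, anything else (including the
# absent/None case) falls back to LocMapResolver. The 'match'/'replace' keys
# and all values are assumptions for illustration; check the resolver classes
# in your pywb version for the exact keys they read.
SAMPLE_SHARD_CONFIG = {
    'shard_index_loc': {
        'match': '.*/',                        # assumed: regex applied to shard paths
        'replace': 'http://shard-host/data/'   # assumed: prefix substituted in
    },
    'max_blocks': 10,
    'reload_interval': 10
}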
def test_s3_read_2():
    pytest.importorskip('boto3')

    res = BlockLoader().load('s3://commoncrawl/crawl-data/CC-MAIN-2015-11/index.html')

    buff = res.read()
    assert len(buff) == 2082

    reader = DecompressingBufferedReader(BytesIO(buff))
    assert reader.readline() == b'<!DOCTYPE html>\n'
def __init__(self, loader=None, cookie_maker=None, block_size=BUFF_SIZE,
             *args, **kwargs):
    if not loader:
        loader = BlockLoader(cookie_maker=cookie_maker)

    self.loader = loader
    self.block_size = block_size

    super(BlockArcWarcRecordLoader, self).__init__(*args, **kwargs)
def copy_rec_files(self, user, collection, recording, warc_files):
    if self.dry_run:
        target_dirname = os.path.join('/tmp/migrate4.0', collection.my_id)
    else:
        target_dirname = user.get_user_temp_warc_path()

    os.makedirs(target_dirname, exist_ok=True)
    print('Writing to dir: ' + target_dirname)

    coll_warc_key = recording.COLL_WARC_KEY.format(coll=collection.my_id)
    rec_warc_key = recording.REC_WARC_KEY.format(rec=recording.my_id)

    # Copy WARCs
    loader = BlockLoader()
    total_size = 0

    for n, url in warc_files.items():
        if not url.startswith('s3://'):
            print('FILE ERR: Skipping local file: ' + url)
            continue

        local_filename = n if n != recording.INDEX_FILE_KEY else os.path.basename(url)
        target_file = os.path.join(target_dirname, local_filename)

        src = loader.load(url)

        try:
            with open(target_file, 'wb') as dest:
                print('Copying {0} -> {1}'.format(url, target_file))
                shutil.copyfileobj(src, dest)
                size = dest.tell()

            target_file = add_local_store_prefix(target_file)

            if n != recording.INDEX_FILE_KEY:
                self.redis.hset(coll_warc_key, n, target_file)
                self.redis.sadd(rec_warc_key, n)
                total_size += size
            else:
                recording.set_prop(n, target_file, update_ts=False)

            if self.dry_run:
                os.remove(strip_prefix(target_file))

        except:
            import traceback
            traceback.print_exc()

    # commit from temp dir to storage
    if not self.dry_run:
        recording.commit_to_storage()

    return total_size
def fetch_local_file(self, uri):
    #fh = open(uri)
    fh = BlockLoader().load_file_or_resource(uri)

    content_type, _ = mimetypes.guess_type(uri)

    # create fake headers for local file
    status_headers = StatusAndHeaders('200 OK',
                                      [('Content-Type', content_type)])

    stream = fh

    return (status_headers, stream)
def test_s3_read_1():
    pytest.importorskip('boto')

    res = BlockLoader().load('s3://commoncrawl/crawl-data/CC-MAIN-2015-11/segments/1424936462700.28/warc/CC-MAIN-20150226074102-00159-ip-10-28-5-156.ec2.internal.warc.gz',
                             offset=53235662,
                             length=2526)

    buff = res.read()
    assert len(buff) == 2526

    reader = DecompressingBufferedReader(BytesIO(buff))
    assert reader.readline() == b'WARC/1.0\r\n'
    assert reader.readline() == b'WARC-Type: response\r\n'
def __init__(self, loader=None, cookie_maker=None, block_size=8192):
    if not loader:
        loader = BlockLoader(cookie_maker)

    self.loader = loader
    self.block_size = block_size

    self.arc_parser = ARCHeadersParser(self.ARC_HEADERS)

    self.warc_parser = StatusAndHeadersParser(self.WARC_TYPES)
    self.http_parser = StatusAndHeadersParser(self.HTTP_TYPES)

    self.http_req_parser = StatusAndHeadersParser(self.HTTP_VERBS)
def get_checksum_and_size(self, filepath_or_url):
    """Returns the checksum of the supplied URL or filepath and the size of the resource

    :param str filepath_or_url: The URL or filepath to the resource
        that the checksum and size is desired for
    :return: A three-tuple containing the kind of checksum, the checksum
        itself, and the size of the resource
    :rtype: tuple[str|None, str|None, int|None]
    """
    m = hashlib.md5()
    amount = 1024 * 1024
    total_size = 0

    with closing(BlockLoader().load(filepath_or_url)) as f:
        while True:
            chunk = f.read(amount)
            chunk_size = len(chunk)
            if chunk_size == 0:
                break
            total_size += chunk_size
            m.update(chunk)

    return 'md5', m.hexdigest(), total_size
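# A hedged usage sketch for get_checksum_and_size(): `resolver` stands for an
# instance of the class that defines it, and 'example.warc.gz' is a
# hypothetical local path (an http:// or s3:// URL works the same way, since
# BlockLoader dispatches on the scheme).
def print_checksum(resolver, path='example.warc.gz'):
    kind, digest, size = resolver.get_checksum_and_size(path)
    print('{0}: {1} ({2} bytes)'.format(kind, digest, size))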
def __init__(self, loader=None, cookie_maker=None, block_size=8192,
             verify_http=True, arc2warc=True):
    if not loader:
        loader = BlockLoader(cookie_maker=cookie_maker)

    self.loader = loader
    self.block_size = block_size

    if arc2warc:
        self.arc_parser = ARC2WARCHeadersParser()
    else:
        self.arc_parser = ARCHeadersParser()

    self.warc_parser = StatusAndHeadersParser(self.WARC_TYPES)
    self.http_parser = StatusAndHeadersParser(self.HTTP_TYPES, verify_http)
    self.http_req_parser = StatusAndHeadersParser(self.HTTP_VERBS, verify_http)
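# A hedged construction sketch for the loader above. With no arguments it
# builds its own BlockLoader, parses ARC headers into WARC-style headers
# (arc2warc=True), and verifies HTTP status lines; verify_http=False relaxes
# that check. The class name and import path are assumed from pywb's
# recordloader module and may differ in your version.
def build_record_loader(verify_http=True):
    from pywb.warc.recordloader import ArcWarcRecordLoader  # assumed path
    return ArcWarcRecordLoader(verify_http=verify_http)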
def wasapi_download(self, username, coll_name, filename):
    user = self._get_wasapi_user(username)
    if not user:
        self._raise_error(404, 'no_such_user')

    collection = user.get_collection_by_name(coll_name)
    if not collection:
        self._raise_error(404, 'no_such_collection')

    #self.access.assert_is_curr_user(user)
    # only users with write access can use wasapi
    self.access.assert_can_write_coll(collection)

    warc_key = collection.get_warc_key()
    warc_path = self.redis.hget(warc_key, filename)

    if not warc_path:
        self._raise_error(404, 'file_not_found')

    response.headers['Content-Type'] = 'application/octet-stream'
    response.headers['Content-Disposition'] = "attachment; filename*=UTF-8''" + filename
    response.headers['Transfer-Encoding'] = 'chunked'

    loader = BlockLoader()
    fh = None
    try:
        fh = loader.load(warc_path)
    except Exception:
        self._raise_error(400, 'file_load_error')

    def read_all(fh):
        for chunk in StreamIter(fh):
            yield chunk

    return read_all(fh)
def __init__(self, static_path):
    mimetypes.init()

    self.static_path = static_path
    self.block_loader = BlockLoader()
def test_mock_webhdfs_load_2():
    expected = 'http://remote-host/webhdfs/v1/some/file.warc.gz?op=OPEN&offset=10'
    with patch('pywb.utils.loaders.HttpLoader.load', mock_load(expected)):
        res = BlockLoader().load('webhdfs://remote-host/some/file.warc.gz', 10, -1)
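# The mock_load() helper used by these webhdfs tests is not shown in this
# section; a minimal sketch consistent with how it is patched in place of
# HttpLoader.load (assert the translated URL, ignore offset/length, return
# nothing) might look like this:
def mock_load(expected_url):
    def mock(self, url, *args, **kwargs):
        assert url == expected_url
    return mock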
def test_mock_webhdfs_load_3_username():
    # the env value must match the user.name in the expected URL below
    os.environ['WEBHDFS_USER'] = 'someuser'
    expected = 'http://remote-host/webhdfs/v1/some/file.warc.gz?op=OPEN&offset=10&user.name=someuser'
    with patch('pywb.utils.loaders.HttpLoader.load', mock_load(expected)):
        res = BlockLoader().load('webhdfs://remote-host/some/file.warc.gz', 10, -1)
def handle_download(self, user, coll_name, recs):
    user, collection = self.user_manager.get_user_coll(user, coll_name)

    if not collection:
        self._raise_error(404, 'no_such_collection')

    if not self.access.is_superuser():
        self.access.assert_can_write_coll(collection)

    #collection['uid'] = coll
    collection.load()

    Stats(self.redis).incr_download(collection)

    now = timestamp_now()

    name = coll_name
    if recs != '*':
        rec_list = recs.split(',')
        if len(rec_list) == 1:
            name = recs
        else:
            name += '-' + recs
    else:
        rec_list = None

    filename = self.download_filename.format(title=quote(name),
                                             timestamp=now)

    loader = BlockLoader()

    coll_info = self.create_coll_warcinfo(user, collection, filename)

    def iter_infos():
        for recording in collection.get_recordings(load=True):
            if rec_list and recording.name not in rec_list:
                continue

            warcinfo = self.create_rec_warcinfo(user,
                                                collection,
                                                recording,
                                                filename)

            size = len(warcinfo)
            size += recording.size

            yield recording, warcinfo, size

    def read_all(infos):
        yield coll_info

        for recording, warcinfo, _ in infos:
            yield warcinfo

            for n, warc_path in recording.iter_all_files():
                try:
                    fh = loader.load(warc_path)
                except Exception:
                    print('Skipping invalid ' + warc_path)
                    continue

                for chunk in StreamIter(fh):
                    yield chunk

    response.headers['Content-Type'] = 'application/octet-stream'
    response.headers['Content-Disposition'] = "attachment; filename*=UTF-8''" + filename

    # if not transfer-encoding, store infos and calculate total size
    if not self.download_chunk_encoded:
        size = len(coll_info)
        infos = list(iter_infos())
        size += sum(size for r, i, size in infos)
        response.headers['Content-Length'] = size
        return read_all(infos)

    else:
        # stream everything
        response.headers['Transfer-Encoding'] = 'chunked'
        return read_all(iter_infos())
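# A hedged arithmetic sketch of the Content-Length computed above: the total
# is the collection-level warcinfo record plus, for each selected recording,
# its own warcinfo record and its stored WARC size. All byte counts below are
# hypothetical.
coll_info_len = 310
rec_infos = [(240, 52000000), (236, 7500000)]  # (warcinfo len, recording size)
content_length = coll_info_len + sum(w + s for w, s in rec_infos)
assert content_length == 310 + 240 + 52000000 + 236 + 7500000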
def test_mock_webhdfs_load_4_token():
    os.environ['WEBHDFS_USER'] = ''
    os.environ['WEBHDFS_TOKEN'] = 'ATOKEN'
    expected = 'http://remote-host/webhdfs/v1/some/file.warc.gz?op=OPEN&offset=10&delegation=ATOKEN'
    with patch('pywb.utils.loaders.HttpLoader.load', mock_load(expected)):
        res = BlockLoader().load('webhdfs://remote-host/some/file.warc.gz', 10, -1)
def handle_download(self, user, coll, rec):
    collection = self.manager.get_collection(user, coll, rec)
    if not collection:
        self._raise_error(404, 'Collection not found', id=coll)

    now = timestamp_now()

    name = collection['id']
    if rec != '*':
        rec_list = rec.split(',')
        if len(rec_list) == 1:
            name = rec
        else:
            name += '-' + rec
    else:
        rec_list = None

    filename = self.download_filename.format(title=quote(name),
                                             timestamp=now)

    loader = BlockLoader()

    coll_info = self.create_coll_warcinfo(user, collection, filename)

    def iter_infos():
        for recording in collection['recordings']:
            if rec_list and recording['id'] not in rec_list:
                continue

            warcinfo = self.create_rec_warcinfo(user,
                                                collection,
                                                recording,
                                                filename)

            size = len(warcinfo)
            size += recording['size']

            yield recording, warcinfo, size

    def read_all(infos):
        yield coll_info

        for recording, warcinfo, _ in infos:
            yield warcinfo

            for warc_path in self._iter_all_warcs(user, coll, recording['id']):
                try:
                    fh = loader.load(warc_path)
                except:
                    print('Skipping invalid ' + warc_path)
                    continue

                for chunk in StreamIter(fh):
                    yield chunk

    response.headers['Content-Type'] = 'application/octet-stream'
    response.headers['Content-Disposition'] = "attachment; filename*=UTF-8''" + filename

    # if not transfer-encoding, store infos and calculate total size
    if not self.download_chunk_encoded:
        size = len(coll_info)
        infos = list(iter_infos())
        size += sum(size for r, i, size in infos)
        response.headers['Content-Length'] = size
        return read_all(infos)

    else:
        # stream everything
        response.headers['Transfer-Encoding'] = 'chunked'
        return read_all(iter_infos())
def test_err_unknown_loader():
    # unknown loader error
    with pytest.raises(IOError):
        BlockLoader().load('foo://example.com', 10).read()
def test_err_no_such_file():
    # no such file; load() raises before read() is ever called
    with pytest.raises(IOError):
        len(BlockLoader().load('_x_no_such_file_', 0, 100).read())
def copy_data_from_recording(self, source, delete_source=False):
    """Copy given recording building block entries.

    :param RedisUniqueComponent source: building block
    :param bool delete_source: whether to delete source building block
    :returns: whether successful or not
    :rtype: bool
    """
    if self == source:
        return False

    if not self.is_open():
        return False

    errored = False

    self._copy_prop(source, 'title')
    self._copy_prop(source, 'desc')
    self._copy_prop(source, 'rec_type')
    self._copy_prop(source, 'recorded_at')
    #self._copy_prop(source, 'patch_rec')

    collection = self.get_owner()
    user = collection.get_owner()

    target_dirname = user.get_user_temp_warc_path()

    target_warc_key = self.COLL_WARC_KEY.format(coll=collection.my_id)

    # Copy WARCs
    loader = BlockLoader()

    for n, url in source.iter_all_files(include_index=True):
        local_filename = n + '.' + timestamp20_now()
        target_file = os.path.join(target_dirname, local_filename)

        src = loader.load(url)

        try:
            with open(target_file, 'wb') as dest:
                print('Copying {0} -> {1}'.format(url, target_file))
                shutil.copyfileobj(src, dest)
                size = dest.tell()

            if n != self.INDEX_FILE_KEY:
                self.incr_size(size)
                self.redis.hset(target_warc_key, n, add_local_store_prefix(target_file))
            else:
                self.set_prop(n, target_file)

        except:
            import traceback
            traceback.print_exc()
            errored = True

    # COPY cdxj, if exists
    source_key = self.CDXJ_KEY.format(rec=source.my_id)
    target_key = self.CDXJ_KEY.format(rec=self.my_id)

    self.redis.zunionstore(target_key, [source_key])

    # recreate pages, if any, in new recording
    source_coll = source.get_owner()
    source_pages = source_coll.list_rec_pages(source)
    collection.import_pages(source_pages, self)

    # COPY remote archives, if any
    self.redis.sunionstore(self.RA_KEY.format(rec=self.my_id),
                           self.RA_KEY.format(rec=source.my_id))

    # COPY recording warc keys
    self.redis.sunionstore(self.REC_WARC_KEY.format(rec=self.my_id),
                           self.REC_WARC_KEY.format(rec=source.my_id))

    # sync collection cdxj, if exists
    collection.sync_coll_index(exists=True, do_async=True)

    if not errored and delete_source:
        collection = source.get_owner()
        collection.remove_recording(source, delete=True)

    return not errored