def load_archive_info_xml(self, url):
    self.archive_infos = {}
    url = os.path.expandvars(url)
    logging.debug('Loading XML from {0}'.format(url))
    if not url:
        return

    try:
        stream = BlockLoader().load(url)
    except Exception as e:
        logging.debug(e)
        logging.debug('Proceeding without xml archive info')
        return

    root = ElementTree.fromstring(stream.read())

    for link in root.findall('link'):
        name = link.get('id')
        longname = link.get('longname')
        archive = link.find('archive')
        timegate = link.find('timegate')

        if timegate is None or archive is None:
            continue

        rewritten = (archive.get('rewritten-urls') == 'yes')
        unrewritten_url = archive.get('un-rewritten-api-url', '')
        uri = timegate.get('uri')

        self.archive_infos[name] = {
            'uri': uri,
            'rewritten': rewritten,
            'unrewritten_url': unrewritten_url,
            'name': longname
        }
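# A minimal sketch (not from the source) of the XML shape that
# load_archive_info_xml() above expects, inferred from the element and
# attribute names it reads. The root element name, archive id, longname,
# and URIs are all hypothetical placeholders.
SAMPLE_ARCHIVE_INFO_XML = """\
<links>
  <link id="examplearchive" longname="Example Web Archive">
    <archive rewritten-urls="yes"
             un-rewritten-api-url="http://web.archive.example/{timestamp}id_/{url}"/>
    <timegate uri="http://web.archive.example/web/"/>
  </link>
</links>
"""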
def load_archive_info_json(self, url):
    self.archive_infos = {}
    url = os.path.expandvars(url)
    logging.debug('Loading JSON from {0}'.format(url))
    if not url:
        return

    try:
        stream = BlockLoader().load(url)
    except Exception as e:
        logging.debug(e)
        logging.debug('Proceeding without json archive info')
        return

    archives = json.loads(stream.read())
    for arc in archives:
        id_ = arc['id']
        name = arc['name']
        uri = arc['timegate']
        base_url = arc.get('base_url', uri)
        unrewritten_url = arc.get('unrewritten_url')
        if not unrewritten_url:
            unrewritten_url = base_url + '{timestamp}id_/{url}'

        self.archive_infos[id_] = {
            'id': id_,
            'uri': uri,
            'name': name,
            'base_url': base_url,
            'unrewritten_url': unrewritten_url
        }
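# A minimal sketch (not from the source) of the JSON document that
# load_archive_info_json() above expects, inferred from the keys it reads:
# a list of objects with required 'id', 'name', and 'timegate' fields and
# optional 'base_url' and 'unrewritten_url'. All values are hypothetical.
SAMPLE_ARCHIVE_INFO_JSON = """\
[
  {
    "id": "examplearchive",
    "name": "Example Web Archive",
    "timegate": "http://web.archive.example/web/",
    "base_url": "http://web.archive.example/web/"
  }
]
"""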
def __init__(self, summary, config=None):
    loc = None
    cookie_maker = None
    self.max_blocks = self.DEFAULT_MAX_BLOCKS
    reload_ival = self.DEFAULT_RELOAD_INTERVAL

    if config:
        loc = config.get('zipnum_loc')
        cookie_maker = config.get('cookie_maker')
        self.max_blocks = config.get('max_blocks', self.max_blocks)
        reload_ival = config.get('reload_interval', reload_ival)

    if not loc:
        splits = os.path.splitext(summary)
        loc = splits[0] + '.loc'

    self.summary = summary
    self.loc_filename = loc

    # initial loc map
    self.loc_map = {}
    self.loc_mtime = 0
    self.load_loc()

    # reload interval
    self.loc_update_time = datetime.datetime.now()
    self.reload_interval = datetime.timedelta(minutes=reload_ival)

    self.blk_loader = BlockLoader(cookie_maker=cookie_maker)
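# Hedged illustration of the default '.loc' sidecar derivation above: when
# config supplies no 'zipnum_loc', the loc filename is the summary path with
# its extension swapped for '.loc'. The path below is hypothetical.
assert os.path.splitext('indexes/cluster.summary')[0] + '.loc' == 'indexes/cluster.loc'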
def __init__(self, summary, config=None):
    self.max_blocks = self.DEFAULT_MAX_BLOCKS
    self.loc_resolver = None
    self.config = config or {}

    loc = None
    cookie_maker = None
    reload_ival = self.DEFAULT_RELOAD_INTERVAL

    if config:
        loc = config.get('shard_index_loc')
        cookie_maker = config.get('cookie_maker')
        self.max_blocks = config.get('max_blocks', self.max_blocks)
        reload_ival = config.get('reload_interval', reload_ival)

    if isinstance(loc, dict):
        self.loc_resolver = LocPrefixResolver(summary, loc)
    else:
        self.loc_resolver = LocMapResolver(summary, loc)

    self.summary = summary

    # reload interval
    self.loc_update_time = datetime.datetime.now()
    self.reload_interval = datetime.timedelta(minutes=reload_ival)

    self.blk_loader = BlockLoader(cookie_maker=cookie_maker)
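# A hedged config sketch for the constructor above: a dict-valued
# 'shard_index_loc' selects LocPrefixResolver, anything else (including the
# absent/None case) falls back to LocMapResolver. The 'match'/'replace' keys
# and all values are assumptions for illustration; check the resolver classes
# in your pywb version for the exact keys they read.
SAMPLE_SHARD_CONFIG = {
    'shard_index_loc': {
        'match': '.*/',                        # assumed: regex applied to shard paths
        'replace': 'http://shard-host/data/'   # assumed: prefix substituted in
    },
    'max_blocks': 10,
    'reload_interval': 10
}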
def test_s3_read_2():
    pytest.importorskip('boto3')

    res = BlockLoader().load('s3://commoncrawl/crawl-data/CC-MAIN-2015-11/index.html')

    buff = res.read()
    assert len(buff) == 2082

    reader = DecompressingBufferedReader(BytesIO(buff))
    assert reader.readline() == b'<!DOCTYPE html>\n'
def __init__(self, loader=None, cookie_maker=None, block_size=BUFF_SIZE,
             *args, **kwargs):
    if not loader:
        loader = BlockLoader(cookie_maker=cookie_maker)

    self.loader = loader
    self.block_size = block_size

    super(BlockArcWarcRecordLoader, self).__init__(*args, **kwargs)
def copy_rec_files(self, user, collection, recording, warc_files):
    if self.dry_run:
        target_dirname = os.path.join('/tmp/migrate4.0', collection.my_id)
    else:
        target_dirname = user.get_user_temp_warc_path()

    os.makedirs(target_dirname, exist_ok=True)
    print('Writing to dir: ' + target_dirname)

    coll_warc_key = recording.COLL_WARC_KEY.format(coll=collection.my_id)
    rec_warc_key = recording.REC_WARC_KEY.format(rec=recording.my_id)

    # Copy WARCs
    loader = BlockLoader()
    total_size = 0

    for n, url in warc_files.items():
        if not url.startswith('s3://'):
            print('FILE ERR: Skipping local file: ' + url)
            continue

        local_filename = n if n != recording.INDEX_FILE_KEY else os.path.basename(url)
        target_file = os.path.join(target_dirname, local_filename)

        src = loader.load(url)

        try:
            with open(target_file, 'wb') as dest:
                print('Copying {0} -> {1}'.format(url, target_file))
                shutil.copyfileobj(src, dest)
                size = dest.tell()

            target_file = add_local_store_prefix(target_file)

            if n != recording.INDEX_FILE_KEY:
                self.redis.hset(coll_warc_key, n, target_file)
                self.redis.sadd(rec_warc_key, n)
                total_size += size
            else:
                recording.set_prop(n, target_file, update_ts=False)

            if self.dry_run:
                os.remove(strip_prefix(target_file))

        except:
            import traceback
            traceback.print_exc()

    # commit from temp dir to storage
    if not self.dry_run:
        recording.commit_to_storage()

    return total_size
def fetch_local_file(self, uri):
    #fh = open(uri)
    fh = BlockLoader().load_file_or_resource(uri)

    content_type, _ = mimetypes.guess_type(uri)

    # create fake headers for local file
    status_headers = StatusAndHeaders('200 OK',
                                      [('Content-Type', content_type)])

    stream = fh

    return (status_headers, stream)
def test_s3_read_1():
    pytest.importorskip('boto')

    res = BlockLoader().load('s3://commoncrawl/crawl-data/CC-MAIN-2015-11/segments/1424936462700.28/warc/CC-MAIN-20150226074102-00159-ip-10-28-5-156.ec2.internal.warc.gz',
                             offset=53235662,
                             length=2526)

    buff = res.read()
    assert len(buff) == 2526

    reader = DecompressingBufferedReader(BytesIO(buff))
    assert reader.readline() == b'WARC/1.0\r\n'
    assert reader.readline() == b'WARC-Type: response\r\n'
def __init__(self, loader=None, cookie_maker=None, block_size=8192):
    if not loader:
        loader = BlockLoader(cookie_maker)

    self.loader = loader
    self.block_size = block_size

    self.arc_parser = ARCHeadersParser(self.ARC_HEADERS)

    self.warc_parser = StatusAndHeadersParser(self.WARC_TYPES)
    self.http_parser = StatusAndHeadersParser(self.HTTP_TYPES)

    self.http_req_parser = StatusAndHeadersParser(self.HTTP_VERBS)
def get_checksum_and_size(self, filepath_or_url):
    """Returns the checksum of the supplied URL or filepath and the size of the resource

    :param str filepath_or_url: The URL or filepath to the resource
        that the checksum and size is desired for
    :return: A three-tuple containing the kind of checksum, the checksum
        itself, and the size of the resource
    :rtype: tuple[str|None, str|None, int|None]
    """
    m = hashlib.md5()
    amount = 1024 * 1024
    total_size = 0

    with closing(BlockLoader().load(filepath_or_url)) as f:
        while True:
            chunk = f.read(amount)
            chunk_size = len(chunk)
            if chunk_size == 0:
                break
            total_size += chunk_size
            m.update(chunk)

    return 'md5', m.hexdigest(), total_size
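# A hedged usage sketch for get_checksum_and_size(): `resolver` stands for an
# instance of the class that defines it, and 'example.warc.gz' is a
# hypothetical local path (an http:// or s3:// URL works the same way, since
# BlockLoader dispatches on the scheme).
def print_checksum(resolver, path='example.warc.gz'):
    kind, digest, size = resolver.get_checksum_and_size(path)
    print('{0}: {1} ({2} bytes)'.format(kind, digest, size))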
def __init__(self, loader=None, cookie_maker=None, block_size=8192,
             verify_http=True, arc2warc=True):
    if not loader:
        loader = BlockLoader(cookie_maker=cookie_maker)

    self.loader = loader
    self.block_size = block_size

    if arc2warc:
        self.arc_parser = ARC2WARCHeadersParser()
    else:
        self.arc_parser = ARCHeadersParser()

    self.warc_parser = StatusAndHeadersParser(self.WARC_TYPES)
    self.http_parser = StatusAndHeadersParser(self.HTTP_TYPES, verify_http)
    self.http_req_parser = StatusAndHeadersParser(self.HTTP_VERBS, verify_http)
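# A hedged construction sketch for the loader above. With no arguments it
# builds its own BlockLoader, parses ARC headers into WARC-style headers
# (arc2warc=True), and verifies HTTP status lines; verify_http=False relaxes
# that check. The class name and import path are assumed from pywb's
# recordloader module and may differ in your version.
def build_record_loader(verify_http=True):
    from pywb.warc.recordloader import ArcWarcRecordLoader  # assumed path
    return ArcWarcRecordLoader(verify_http=verify_http)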
def wasapi_download(self, username, coll_name, filename):
    user = self._get_wasapi_user(username)
    if not user:
        self._raise_error(404, 'no_such_user')

    collection = user.get_collection_by_name(coll_name)
    if not collection:
        self._raise_error(404, 'no_such_collection')

    #self.access.assert_is_curr_user(user)
    # only users with write access can use wasapi
    self.access.assert_can_write_coll(collection)

    warc_key = collection.get_warc_key()
    warc_path = self.redis.hget(warc_key, filename)

    if not warc_path:
        self._raise_error(404, 'file_not_found')

    response.headers['Content-Type'] = 'application/octet-stream'
    response.headers['Content-Disposition'] = "attachment; filename*=UTF-8''" + filename
    response.headers['Transfer-Encoding'] = 'chunked'

    loader = BlockLoader()
    fh = None
    try:
        fh = loader.load(warc_path)
    except Exception:
        self._raise_error(400, 'file_load_error')

    def read_all(fh):
        for chunk in StreamIter(fh):
            yield chunk

    return read_all(fh)
def __init__(self, static_path):
    mimetypes.init()

    self.static_path = static_path
    self.block_loader = BlockLoader()
def test_mock_webhdfs_load_2():
    expected = 'http://remote-host/webhdfs/v1/some/file.warc.gz?op=OPEN&offset=10'
    with patch('pywb.utils.loaders.HttpLoader.load', mock_load(expected)):
        res = BlockLoader().load('webhdfs://remote-host/some/file.warc.gz', 10, -1)
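# The mock_load() helper used by these webhdfs tests is not shown in this
# section; a minimal sketch consistent with how it is patched in place of
# HttpLoader.load (assert the translated URL, ignore offset/length, return
# nothing) might look like this:
def mock_load(expected_url):
    def mock(self, url, *args, **kwargs):
        assert url == expected_url
    return mock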
def test_mock_webhdfs_load_3_username():
    # the env value must match the user.name in the expected URL below
    os.environ['WEBHDFS_USER'] = 'someuser'
    expected = 'http://remote-host/webhdfs/v1/some/file.warc.gz?op=OPEN&offset=10&user.name=someuser'
    with patch('pywb.utils.loaders.HttpLoader.load', mock_load(expected)):
        res = BlockLoader().load('webhdfs://remote-host/some/file.warc.gz', 10, -1)
def handle_download(self, user, coll_name, recs):
    user, collection = self.user_manager.get_user_coll(user, coll_name)

    if not collection:
        self._raise_error(404, 'no_such_collection')

    if not self.access.is_superuser():
        self.access.assert_can_write_coll(collection)

    #collection['uid'] = coll
    collection.load()

    Stats(self.redis).incr_download(collection)

    now = timestamp_now()

    name = coll_name
    if recs != '*':
        rec_list = recs.split(',')
        if len(rec_list) == 1:
            name = recs
        else:
            name += '-' + recs
    else:
        rec_list = None

    filename = self.download_filename.format(title=quote(name),
                                             timestamp=now)

    loader = BlockLoader()

    coll_info = self.create_coll_warcinfo(user, collection, filename)

    def iter_infos():
        for recording in collection.get_recordings(load=True):
            if rec_list and recording.name not in rec_list:
                continue

            warcinfo = self.create_rec_warcinfo(user,
                                                collection,
                                                recording,
                                                filename)

            size = len(warcinfo)
            size += recording.size

            yield recording, warcinfo, size

    def read_all(infos):
        yield coll_info

        for recording, warcinfo, _ in infos:
            yield warcinfo

            for n, warc_path in recording.iter_all_files():
                try:
                    fh = loader.load(warc_path)
                except Exception:
                    print('Skipping invalid ' + warc_path)
                    continue

                for chunk in StreamIter(fh):
                    yield chunk

    response.headers['Content-Type'] = 'application/octet-stream'
    response.headers['Content-Disposition'] = "attachment; filename*=UTF-8''" + filename

    # if not transfer-encoding, store infos and calculate total size
    if not self.download_chunk_encoded:
        size = len(coll_info)
        infos = list(iter_infos())
        size += sum(size for r, i, size in infos)
        response.headers['Content-Length'] = size
        return read_all(infos)

    else:
        # stream everything
        response.headers['Transfer-Encoding'] = 'chunked'
        return read_all(iter_infos())
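# A hedged arithmetic sketch of the Content-Length computed above: the total
# is the collection-level warcinfo record plus, for each selected recording,
# its own warcinfo record and its stored WARC size. All byte counts below are
# hypothetical.
coll_info_len = 310
rec_infos = [(240, 52000000), (236, 7500000)]  # (warcinfo len, recording size)
content_length = coll_info_len + sum(w + s for w, s in rec_infos)
assert content_length == 310 + 240 + 52000000 + 236 + 7500000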
def test_mock_webhdfs_load_4_token():
    os.environ['WEBHDFS_USER'] = ''
    os.environ['WEBHDFS_TOKEN'] = 'ATOKEN'
    expected = 'http://remote-host/webhdfs/v1/some/file.warc.gz?op=OPEN&offset=10&delegation=ATOKEN'
    with patch('pywb.utils.loaders.HttpLoader.load', mock_load(expected)):
        res = BlockLoader().load('webhdfs://remote-host/some/file.warc.gz', 10, -1)
def handle_download(self, user, coll, rec):
    collection = self.manager.get_collection(user, coll, rec)
    if not collection:
        self._raise_error(404, 'Collection not found', id=coll)

    now = timestamp_now()

    name = collection['id']
    if rec != '*':
        rec_list = rec.split(',')
        if len(rec_list) == 1:
            name = rec
        else:
            name += '-' + rec
    else:
        rec_list = None

    filename = self.download_filename.format(title=quote(name),
                                             timestamp=now)

    loader = BlockLoader()

    coll_info = self.create_coll_warcinfo(user, collection, filename)

    def iter_infos():
        for recording in collection['recordings']:
            if rec_list and recording['id'] not in rec_list:
                continue

            warcinfo = self.create_rec_warcinfo(user,
                                                collection,
                                                recording,
                                                filename)

            size = len(warcinfo)
            size += recording['size']

            yield recording, warcinfo, size

    def read_all(infos):
        yield coll_info

        for recording, warcinfo, _ in infos:
            yield warcinfo

            for warc_path in self._iter_all_warcs(user, coll, recording['id']):
                try:
                    fh = loader.load(warc_path)
                except:
                    print('Skipping invalid ' + warc_path)
                    continue

                for chunk in StreamIter(fh):
                    yield chunk

    response.headers['Content-Type'] = 'application/octet-stream'
    response.headers['Content-Disposition'] = "attachment; filename*=UTF-8''" + filename

    # if not transfer-encoding, store infos and calculate total size
    if not self.download_chunk_encoded:
        size = len(coll_info)
        infos = list(iter_infos())
        size += sum(size for r, i, size in infos)
        response.headers['Content-Length'] = size
        return read_all(infos)

    else:
        # stream everything
        response.headers['Transfer-Encoding'] = 'chunked'
        return read_all(iter_infos())
def test_err_unknown_loader():
    # unknown loader error
    with pytest.raises(IOError):
        BlockLoader().load('foo://example.com', 10).read()
def test_err_no_such_file():
    # no such file; load() raises before read() is ever called
    with pytest.raises(IOError):
        len(BlockLoader().load('_x_no_such_file_', 0, 100).read())
def copy_data_from_recording(self, source, delete_source=False):
    """Copy given recording building block entries.

    :param RedisUniqueComponent source: building block
    :param bool delete_source: whether to delete source building block
    :returns: whether successful or not
    :rtype: bool
    """
    if self == source:
        return False

    if not self.is_open():
        return False

    errored = False

    self._copy_prop(source, 'title')
    self._copy_prop(source, 'desc')
    self._copy_prop(source, 'rec_type')
    self._copy_prop(source, 'recorded_at')
    #self._copy_prop(source, 'patch_rec')

    collection = self.get_owner()
    user = collection.get_owner()

    target_dirname = user.get_user_temp_warc_path()

    target_warc_key = self.COLL_WARC_KEY.format(coll=collection.my_id)

    # Copy WARCs
    loader = BlockLoader()

    for n, url in source.iter_all_files(include_index=True):
        local_filename = n + '.' + timestamp20_now()
        target_file = os.path.join(target_dirname, local_filename)

        src = loader.load(url)

        try:
            with open(target_file, 'wb') as dest:
                print('Copying {0} -> {1}'.format(url, target_file))
                shutil.copyfileobj(src, dest)
                size = dest.tell()

            if n != self.INDEX_FILE_KEY:
                self.incr_size(size)
                self.redis.hset(target_warc_key, n, add_local_store_prefix(target_file))
            else:
                self.set_prop(n, target_file)

        except:
            import traceback
            traceback.print_exc()
            errored = True

    # COPY cdxj, if exists
    source_key = self.CDXJ_KEY.format(rec=source.my_id)
    target_key = self.CDXJ_KEY.format(rec=self.my_id)

    self.redis.zunionstore(target_key, [source_key])

    # recreate pages, if any, in new recording
    source_coll = source.get_owner()
    source_pages = source_coll.list_rec_pages(source)
    collection.import_pages(source_pages, self)

    # COPY remote archives, if any
    self.redis.sunionstore(self.RA_KEY.format(rec=self.my_id),
                           self.RA_KEY.format(rec=source.my_id))

    # COPY recording warc keys
    self.redis.sunionstore(self.REC_WARC_KEY.format(rec=self.my_id),
                           self.REC_WARC_KEY.format(rec=source.my_id))

    # sync collection cdxj, if exists
    collection.sync_coll_index(exists=True, do_async=True)

    if not errored and delete_source:
        collection = source.get_owner()
        collection.remove_recording(source, delete=True)

    return not errored