def create_bookmark(self, props, incr_stats=True):
    """Create a single bookmark in this list.

    :param dict props: bookmark properties; mutated in place (receives a
        new 'id', any 'rec' key is dropped)
    :param bool incr_stats: whether to count this add in global stats
    :returns: the stored bookmark dict, or None if the referenced page
        does not exist
    """
    owner_coll = self.get_owner()
    self.access.assert_can_write_coll(owner_coll)

    # the rec id must never be persisted with the bookmark
    props.pop('rec', '')

    # if a page is linked to this bookmark, it must exist in the collection
    linked_page = props.get('page_id')
    if linked_page and not owner_coll.page_exists(linked_page):
        return None

    new_id = self.get_new_bookmark_id()
    props['id'] = new_id

    self.bookmark_order.insert_ordered_id(new_id, props.get('before_id'))

    self.redis.hset(self.BOOK_CONTENT_KEY.format(blist=self.my_id),
                    new_id, json.dumps(props))

    if incr_stats:
        Stats(self.redis).incr_bookmark_add()

    if linked_page:
        owner_coll.clear_page_bookmark_cache()

    self.load_pages([props])
    return props
def add_bookmarks(self, bookmarks):
    """Bulk-add bookmarks to this list with a single redis write.

    :param list bookmarks: bookmark property dicts; mutated in place
        (each receives a new 'id', any 'rec' key is dropped)
    """
    self.access.assert_can_write_coll(self.get_owner())

    serialized = {}
    page_linked = False

    for data in bookmarks:
        # never persist a supplied rec id
        data.pop('rec', '')

        # remember whether any bookmark references a page, so the
        # page->bookmark cache can be invalidated once at the end
        if data.get('page_id'):
            page_linked = True

        new_id = self.get_new_bookmark_id()
        data['id'] = new_id

        serialized[new_id] = json.dumps(data)

    self.bookmark_order.insert_ordered_ids(serialized.keys())

    self.redis.hmset(self.BOOK_CONTENT_KEY.format(blist=self.my_id), serialized)

    if page_linked:
        self.get_owner().clear_page_bookmark_cache()

    Stats(self.redis).incr_bookmark_add(len(bookmarks))
    self.mark_updated()
def upload_file():
    """Route handler: upload a file for the current session user.

    Anonymous users are only allowed to upload into external collections.
    """
    user = self.access.session_user

    force_coll_name = request.query.getunicode('force-coll', '')
    collection = user.get_collection_by_name(force_coll_name) if force_coll_name else None

    # anonymous users may upload only when targeting an external collection
    if not collection or not collection.is_external():
        if user.is_anon():
            return self._raise_error(400, 'not_logged_in')

    expected_size = int(request.headers['Content-Length'])
    if not expected_size:
        return self._raise_error(400, 'no_file_specified')

    filename = request.query.getunicode('filename')
    stream = request.environ['wsgi.input']

    res = self.uploader.upload_file(user, stream, expected_size,
                                    filename, force_coll_name)

    if 'error' in res:
        return self._raise_error(400, res['error'])

    Stats(self.redis).incr_upload(user, expected_size)
    return res
def __init__(self, *args, **kwargs):
    """Initialize the websocket controller and its stats helpers."""
    super(WebsockController, self).__init__(*args, **kwargs)

    config = kwargs['config']

    # interval between pushed status updates, in seconds
    self.status_update_secs = float(config['status_update_secs'])

    self.browser_mgr = kwargs['browser_mgr']
    self.content_app = kwargs['content_app']

    self.dyn_stats = DynStats(self.redis, config)
    self.stats = Stats(self.redis)
def _add_stats(self, cdx, resp_headers, kwargs, record):
    """Update replay and dynamic stats for a single served record.

    :param dict cdx: cdx entry for the record
    :param resp_headers: response headers (Recorder-* hints)
    :param dict kwargs: request context (type, user, rec, recordings)
    :param record: the warc record being served
    """
    mode = kwargs['type']

    # count replayed bytes against the user for collection replay
    if mode == 'replay-coll':
        content_len = record.rec_headers.get_header('Content-Length')
        if content_len is not None:
            Stats(self.redis).incr_replay(int(content_len), kwargs['user'])

    if mode in ('record', 'live'):
        return

    source = cdx.get('source')
    if not source:
        return

    if source == 'local':
        source = 'replay'

    # patching from replay contributes nothing new
    if source == 'replay' and mode == 'patch':
        return

    orig_source = cdx.get('orig_source_id')
    if orig_source:
        source = orig_source

    ra_rec = None
    ra_recording = None

    # resolve which recording actually captured this response
    if mode in self.MODIFY_MODES:
        skip = resp_headers.get('Recorder-Skip')
        if not skip and source not in ('live', 'replay'):
            ra_rec = unquote(resp_headers.get('Recorder-Rec', '')) or kwargs['rec']

            recording = kwargs.get('recording')
            patch_recording = kwargs.get('patch_recording')

            if recording and ra_rec == recording.my_id:
                ra_recording = recording
            elif patch_recording and ra_rec == patch_recording.my_id:
                ra_recording = patch_recording

    url = cdx.get('url')

    # fall back to the url itself when there is no usable referrer,
    # or when the referrer points back at this host (non-proxy mode)
    referrer = request.environ.get('HTTP_REFERER')
    if not referrer:
        referrer = url
    elif ('wsgiprox.proxy_host' not in request.environ and
          request.environ.get('HTTP_HOST') in referrer):
        referrer = url

    self.dyn_stats.update_dyn_stats(url, kwargs, referrer, source, ra_recording)
def delete_me(self, storage, pages=True):
    """Delete this recording's files and its object.

    :param BaseStorage storage: storage backend to delete files from
    :param bool pages: also remove this recording's pages
    :returns: deletion result; contains 'error' key on failure
    :rtype: dict
    """
    res = self.delete_files(storage)

    Stats(self.redis).incr_delete(self)

    # when a whole collection is being deleted its pages go with it,
    # so per-recording page removal can be skipped via pages=False
    if pages:
        self.get_owner().delete_rec_pages(self)

    if not self.delete_object():
        res['error'] = 'not_found'

    return res
def remove_bookmark(self, bid):
    """Remove a bookmark from this list.

    :param str bid: bookmark id
    :returns: True if the bookmark content entry was deleted
    :rtype: bool
    """
    self.access.assert_can_write_coll(self.get_owner())

    if not self.bookmark_order.remove_ordered_id(bid):
        return False

    # if the bookmark was linked to a page, detach it there as well
    bookmark = self.get_bookmark(bid)
    page_id = bookmark.get('page_id')
    if page_id:
        self.get_owner().remove_page_bookmark(page_id, bid)

    deleted = self.redis.hdel(self.BOOK_CONTENT_KEY.format(blist=self.my_id), bid) == 1
    if deleted:
        Stats(self.redis).incr_bookmark_del()

    return deleted
def update_bookmark(self, bid, props):
    """Update the editable fields of an existing bookmark.

    :param str bid: bookmark id
    :param dict props: new property values; keys outside the editable
        set are silently ignored
    :returns: updated bookmark dict, or False if not found
    """
    self.access.assert_can_write_coll(self.get_owner())

    bookmark = self.get_bookmark(bid)
    if not bookmark:
        return False

    # only these fields may be modified by callers
    editable = ('title', 'url', 'timestamp', 'browser', 'desc')
    bookmark.update({key: value for key, value in props.items()
                     if key in editable})

    self.redis.hset(self.BOOK_CONTENT_KEY.format(blist=self.my_id),
                    bid, json.dumps(bookmark))

    Stats(self.redis).incr_bookmark_mod()
    return bookmark
def move(self, collection, new_name, new_user):
    """Move a collection from this user to another user.

    :param collection: collection to move
    :param str new_name: desired name under the new owner
    :param new_user: receiving user
    :returns: True on success, False if the target is this same user or
        the collection could not be detached
    :rtype: bool
    """
    # moving to oneself is a no-op
    if self == new_user:
        return False

    new_name = new_user.colls.reserve_obj_name(new_name, allow_dupe=False)

    if not self.colls.remove_object(collection):
        return False

    new_user.colls.add_object(new_name, collection, owner=True)

    # transfer size accounting between the two owners
    self.incr_size(-collection.size)
    new_user.incr_size(collection.size)

    Stats(self.redis).move_temp_to_user_usage(collection)

    # closing each recording marks it for commit
    for recording in collection.get_recordings():
        recording.set_closed()

    return True
def delete_me(self, storage, pages=True):
    """Delete recording.

    :param BaseStorage storage: Webrecorder storage
    :param bool pages: whether to delete pages
    :returns: result
    :rtype: dict
    """
    res = self.delete_files(storage)

    Stats(self.redis).incr_delete(self)

    # no per-recording page cleanup needed when the whole collection is
    # going away -- its pages are removed along with the collection
    if pages:
        self.get_owner().delete_rec_pages(self)

    if not self.delete_object():
        res['error'] = 'not_found'

    return res
def upload_file():
    """Route handler: upload a file for the logged-in session user."""
    if self.access.session_user.is_anon():
        return self._raise_error(400, 'not_logged_in')

    expected_size = int(request.headers['Content-Length'])
    if not expected_size:
        return self._raise_error(400, 'no_file_specified')

    force_coll_name = request.query.getunicode('force-coll', '')
    filename = request.query.getunicode('filename')
    stream = request.environ['wsgi.input']

    user = self.access.session_user

    res = self.uploader.upload_file(user, stream, expected_size,
                                    filename, force_coll_name)

    if 'error' in res:
        return self._raise_error(400, res['error'])

    Stats(self.redis).incr_upload(user, expected_size)
    return res
def handle_download(self, user, coll_name, recs):
    """Stream a collection (or selected recordings) as a warc download.

    :param user: user name
    :param str coll_name: collection name
    :param str recs: comma-separated recording names, or '*' for all
    :returns: iterable of warc data chunks
    """
    user, collection = self.user_manager.get_user_coll(user, coll_name)

    if not collection:
        self._raise_error(404, 'no_such_collection')

    # superusers may download any collection; others need write access
    if not self.access.is_superuser():
        self.access.assert_can_write_coll(collection)

    collection.load()

    Stats(self.redis).incr_download(collection)

    now = timestamp_now()

    # build the download name from the collection or selected recordings
    rec_list = None
    name = coll_name
    if recs != '*':
        rec_list = recs.split(',')
        name = recs if len(rec_list) == 1 else name + '-' + recs

    filename = self.download_filename.format(title=quote(name), timestamp=now)

    loader = BlockLoader()

    coll_info = self.create_coll_warcinfo(user, collection, filename)

    def iter_infos():
        # yields (recording, warcinfo record, warcinfo + payload size)
        for recording in collection.get_recordings(load=True):
            if rec_list and recording.name not in rec_list:
                continue

            warcinfo = self.create_rec_warcinfo(user, collection,
                                                recording, filename)

            yield recording, warcinfo, len(warcinfo) + recording.size

    def read_all(infos):
        yield coll_info

        for recording, warcinfo, _ in infos:
            yield warcinfo

            for n, warc_path in recording.iter_all_files():
                try:
                    fh = loader.load(warc_path)
                except Exception:
                    print('Skipping invalid ' + warc_path)
                    continue

                for chunk in StreamIter(fh):
                    yield chunk

    response.headers['Content-Type'] = 'application/octet-stream'
    response.headers['Content-Disposition'] = "attachment; filename*=UTF-8''" + filename

    if not self.download_chunk_encoded:
        # no chunked encoding: materialize infos to compute total size
        infos = list(iter_infos())
        response.headers['Content-Length'] = (len(coll_info) +
                                              sum(total for _, _, total in infos))
        return read_all(infos)
    else:
        # stream everything with chunked transfer encoding
        response.headers['Transfer-Encoding'] = 'chunked'
        return read_all(iter_infos())
def create_browser():
    """Api to launch remote browser instances.

    Reads browser/coll/rec/mode/url/timestamp from the query string,
    validates the target collection and recording, then requests a new
    remote browser from the browser manager.

    :returns: browser manager response dict
    :raises: 403 for invalid content requests, 404 for missing
        collection/recording, 400 for an invalid mode or launch failure
    """
    sesh = self.get_session()

    if sesh.is_new() and self.is_content_request():
        self._raise_error(403, 'invalid_browser_request')

    browser_id = request.query['browser']
    Stats(self.redis).incr_browser(browser_id)

    user = self.get_user(redir_check=False)

    data = request.query

    coll_name = data.getunicode('coll', '')
    rec = data.get('rec', '')
    mode = data.get('mode', '')
    url = data.getunicode('url', '')
    timestamp = data.get('timestamp', '')

    sources = ''
    inv_sources = ''
    patch_rec = ''

    collection = user.get_collection_by_name(coll_name)

    # BUGFIX: verify the collection exists *before* dereferencing it;
    # previously collection.get_recording() ran first, turning a missing
    # collection into an AttributeError instead of a clean 404
    if not collection:
        self._raise_error(404, 'no_such_collection')

    recording = collection.get_recording(rec)

    if mode == 'extract':
        # Extract from All, Patch from None
        sources = '*'
        inv_sources = '*'
    elif mode.startswith('extract:'):
        # Extract from One, Patch from all but one
        sources = mode.split(':', 1)[1]
        inv_sources = sources

        # carry over the linked patch recording, if any
        if recording:
            patch_rec = recording.get_prop('patch_rec')

        mode = 'extract'
    elif mode.startswith('extract_only:'):
        # Extract from one only, no patching
        sources = mode.split(':', 1)[1]
        inv_sources = '*'
        mode = 'extract'

    if mode in self.MODIFY_MODES:
        if not recording:
            return self._raise_error(404, 'no_such_recording')
    elif mode in ('replay', 'replay-coll'):
        rec = '*'
    else:
        return self._raise_error(400, 'invalid_mode')

    browser_can_write = '1' if self.access.can_write_coll(collection) else '0'

    remote_ip = self._get_remote_ip()

    # build the launch params for the browser manager
    kwargs = dict(user=user.name,
                  id=sesh.get_id(),
                  coll=collection.my_id,
                  rec=rec,
                  coll_name=quote(coll_name),
                  type=mode,
                  sources=sources,
                  inv_sources=inv_sources,
                  patch_rec=patch_rec,
                  remote_ip=remote_ip,
                  ip=remote_ip,
                  browser=browser_id,
                  url=url,
                  request_ts=timestamp,
                  browser_can_write=browser_can_write)

    data = self.browser_mgr.request_new_browser(kwargs)

    if 'error_message' in data:
        self._raise_error(400, data['error_message'])

    return data
def handle_download_name(self, user, coll_name, warc_name, url):
    """Export a collection as a single warc plus an html landing page.

    Writes a landing-page html file and a combined .warc (named after
    *warc_name* with its '10.25354/' DOI prefix stripped) under the
    STORAGE_REPLAY directory tree.

    :param str user: user name
    :param str coll_name: collection name
    :param str warc_name: DOI-style warc name, e.g. '10.25354/xyz'
    :param str url: url to embed in the landing page
    """
    user_name = user
    user = self.user_manager.get_user(user)

    collection = user.get_collection_by_name(coll_name)
    if not collection:
        self._raise_error(404, 'no_such_collection')

    self.access.assert_can_write_coll(collection)

    collection.load()

    Stats(self.redis).incr_download(collection)

    # NOTE(review): currently unused -- confirm before removing entirely
    download_path = self.get_origin() + "/api/v1/download/{}/".format(user_name)

    # strip the DOI prefix to form local file names
    # (BUGFIX: a prior replace("/", "\/") whose result was immediately
    # discarded has been removed as dead code)
    warc_name_broke = warc_name.replace("10.25354/", "")

    local_storage = LocalFileStorage(self.redis)

    landingpage = template(
        'webrecorder/templates/landingpage.html',
        title=coll_name,
        warc_file=
        'https://projects.zo.uni-heidelberg.de/webarchive/warc/10.25354/' +
        warc_name_broke + '.warc',
        url=url)

    # ensure the landing-page directory exists
    try:
        os.makedirs(
            os.path.join(os.environ['STORAGE_REPLAY'], 'lp', '10.25354'))
        print("Directory '% s' created" % os.path.isfile(
            os.path.join(os.environ['STORAGE_REPLAY'], 'lp', '10.25354')))
    except FileExistsError:
        print("Directory '% s' already created!" %
              os.path.join(os.environ['STORAGE_REPLAY'], 'lp', '10.25354'))
    except FileNotFoundError:
        print("Directory '% s' No such file or directory!" %
              os.path.join(os.environ['STORAGE_REPLAY'], 'lp', '10.25354'))

    # ensure the warc directory exists
    try:
        os.makedirs(
            os.path.join(os.environ['STORAGE_REPLAY'], 'warc', '10.25354'))
        print("Directory '% s' created" % os.path.isfile(
            os.path.join(os.environ['STORAGE_REPLAY'], 'warc', '10.25354')))
    except FileExistsError:
        print(
            "Directory '% s' already created!" %
            os.path.join(os.environ['STORAGE_REPLAY'], 'warc', '10.25354'))
    except FileNotFoundError:
        print(
            "Directory '% s' No such file or directory!" %
            os.path.join(os.environ['STORAGE_REPLAY'], 'warc', '10.25354'))

    # write out the landing page html
    try:
        f = open(
            os.path.join(os.environ['STORAGE_REPLAY'], 'lp', '10.25354',
                         warc_name_broke) + ".html", 'w')
        f.write(landingpage)
        f.close()
    except FileExistsError:
        print(
            os.path.join(os.environ['STORAGE_REPLAY'], 'lp', '10.25354',
                         warc_name_broke) + ".html exists")
    except FileNotFoundError:
        print(
            os.path.join(os.environ['STORAGE_REPLAY'], 'lp', '10.25354',
                         warc_name_broke) + ".html doesn't exists")

    commit_storage = collection.get_storage()

    for recording in collection.get_recordings():
        # NOTE(review): is_open/storage appear unused below -- the calls are
        # kept in case they have side effects; confirm before removing
        is_committed = recording.is_fully_committed()
        is_open = not is_committed and recording.get_pending_count() > 0
        storage = commit_storage if is_committed else local_storage

        try:
            f = open(
                os.path.join(os.environ['STORAGE_REPLAY'], 'warc', '10.25354',
                             warc_name_broke) + ".warc", 'wb')
            writer = WARCWriter(f, gzip=True)

            # copy every warc record of this recording into the combined file
            for name, path in recording.iter_all_files(include_index=False):
                warc_key = collection.get_warc_key()
                warc_path = self.redis.hget(warc_key, name)

                # BUGFIX: check for a missing path *before* the substring
                # tests below, which raise TypeError when warc_path is None
                if not warc_path:
                    self._raise_error(404, 'file_not_found')

                # strip internal nginx origin to get a local filesystem path
                if 'http://nginx:6090' in warc_path:
                    warc_path = warc_path.replace('http://nginx:6090', '')
                if 'https://nginx:6090' in warc_path:
                    warc_path = warc_path.replace('https://nginx:6090', '')

                with open(warc_path, 'rb') as stream:
                    for record in ArchiveIterator(stream):
                        writer.write_record(record)

            f.close()
        except FileExistsError:
            print(
                os.path.join(os.environ['STORAGE_REPLAY'], 'warc', '10.25354',
                             warc_name_broke) + ".warc exists")
        except FileNotFoundError:
            print(
                os.path.join(os.environ['STORAGE_REPLAY'], 'warc', '10.25354',
                             warc_name_broke) + ".warc doesn't exists")