def _init_upload_status(self, user, total_size, num_files, filename=None, expire=None):
    """Initialize upload status.

    :param User user: user
    :param int total_size: size of WARC archive
    :param int num_files: number of files
    :param filename: WARC archive filename
    :type: str or None
    :param expire: upload TTL
    :type: int or None

    :returns: upload ID and upload Redis key
    :rtype: str and str
    """
    upload_id = self._get_upload_id()

    upload_key = self.UPLOAD_KEY.format(user=user.name, upid=upload_id)

    with redis_pipeline(self.redis) as pi:
        pi.hset(upload_key, 'size', 0)
        pi.hset(upload_key, 'total_size', total_size * 2)
        pi.hset(upload_key, 'total_files', num_files)
        pi.hset(upload_key, 'files', num_files)
        if filename:
            pi.hset(upload_key, 'filename', filename)
        if expire:
            pi.expire(upload_key, expire)

    return upload_id, upload_key
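# All of the snippets below rely on a `redis_pipeline` context manager. A minimal
# sketch of what such a helper could look like is shown here, assuming it simply
# wraps redis.Redis.pipeline() and executes the queued commands when the block
# exits; this is an illustration, not the project's actual implementation.
from contextlib import contextmanager

@contextmanager
def redis_pipeline(redis_obj):
    # queue commands on a transactional pipeline, execute them on exit
    pi = redis_obj.pipeline(transaction=True)
    try:
        yield pi
    finally:
        pi.execute()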
def update_dyn_stats(self, url, params, referrer, source, ra_recording):
    if referrer.endswith('.css'):
        css_res = self._res_url_templ(self.dyn_ref_templ, params, referrer)
        orig_referrer = self.redis.get(css_res)
        if orig_referrer:
            referrer = orig_referrer

    dyn_stats_key = self._res_url_templ(self.dyn_stats_key_templ, params, referrer)

    curr_url_key = self._res_url_templ(self.dyn_stats_key_templ, params, url)

    with redis_pipeline(self.redis) as pi:
        pi.delete(curr_url_key)
        pi.hincrby(dyn_stats_key, source, 1)
        pi.expire(dyn_stats_key, self.dyn_stats_secs)

        if url.endswith('.css'):
            css_res = self._res_url_templ(self.dyn_ref_templ, params, url)
            pi.setex(css_res, self.dyn_stats_secs, referrer)

        if ra_recording:
            ra_recording.track_remote_archive(pi, source)
def incr_record(self, params, size, cdx_list):
    username = params.get('param.user')
    if not username:
        return

    today = today_str()

    with redis_pipeline(self.redis) as pi:
        # rate limiting
        rate_limit_key = self.get_rate_limit_key(params)
        if rate_limit_key:
            pi.incrby(rate_limit_key, size)
            pi.expire(rate_limit_key, self.RATE_LIMIT_TTL)

        # write size to usage hashes
        if username.startswith(self.TEMP_PREFIX):
            key = self.ALL_CAPTURE_TEMP_KEY
        else:
            key = self.ALL_CAPTURE_USER_KEY

        if key:
            pi.hincrby(key, today, size)

    is_extract = params.get('sources') is not None
    is_patch = params.get('param.recorder.rec') is not None

    if is_extract or is_patch:
        with redis_pipeline(self.redis) as pi:
            for cdx in cdx_list:
                try:
                    cdx = CDXObject(cdx)
                    source_id = cdx['orig_source_id']
                    size = int(cdx['length'])
                    if source_id and size:
                        pi.hincrby(self.SOURCES_KEY.format(source_id), today, size)

                except Exception:
                    pass

            if is_patch:
                if username.startswith(self.TEMP_PREFIX):
                    key = self.PATCH_TEMP_KEY
                else:
                    key = self.PATCH_USER_KEY

                pi.hincrby(key, today, size)
def move_temp_to_user_usage(self, collection):
    today = today_str()
    date_str = collection.get_created_iso_date()
    size = collection.size

    with redis_pipeline(self.redis) as pi:
        pi.hincrby(self.TEMP_MOVE_COUNT_KEY, today, 1)
        pi.hincrby(self.TEMP_MOVE_SIZE_KEY, today, size)
        pi.hincrby(self.ALL_CAPTURE_USER_KEY, date_str, size)
        pi.hincrby(self.ALL_CAPTURE_TEMP_KEY, date_str, -size)
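# Sketch of the usage-hash layout the stats methods above maintain: each hash maps
# a YYYY-MM-DD date string to a running byte total, so moving a temp collection
# adds its size under the user hash and subtracts it from the temp hash. The key
# names and values here are placeholders for illustration only; assumes a local
# Redis instance for the demo.
import redis

r = redis.StrictRedis()
r.hincrby('st:all-capture-user', '2020-01-01', 1024)
r.hincrby('st:all-capture-temp', '2020-01-01', -512)
print(r.hgetall('st:all-capture-user'))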
def inc_pending_count(self):
    """Increase outstanding CDX index lines."""
    if not self.is_open(extend=False):
        return

    pending_count = self.PENDING_COUNT_KEY.format(rec=self.my_id)

    with redis_pipeline(self.redis) as pi:
        pi.incrby(pending_count, 1)
        pi.expire(pending_count, self.PENDING_TTL)
def __setitem__(self, name, obj):
    if not isinstance(obj, dict):
        raise Exception('Must assign a dict')

    user = self.make_user(name)
    user.access.assert_is_curr_user(user)
    user.data.update(obj)

    with redis_pipeline(self.redis) as pi:
        user.commit(pi)
        pi.sadd(self.users_key, name)
def multifile_upload(self, user, files):
    total_size = 0

    for filename in files:
        total_size += os.path.getsize(filename)

    upload_id = self._get_upload_id()

    upload_key = self.upload_key.format(user=user, upid=upload_id)

    with redis_pipeline(self.manager.redis) as pi:
        pi.hset(upload_key, 'size', 0)
        pi.hset(upload_key, 'total_size', total_size * 2)
        pi.hset(upload_key, 'total_files', len(files))
        pi.hset(upload_key, 'files', len(files))
        pi.expire(upload_key, 120)

    gevent.sleep(0)

    for filename in files:
        size = 0
        fh = None
        try:
            size = os.path.getsize(filename)
            fh = open(filename, 'rb')

            self.manager.redis.hset(upload_key, 'filename', filename)

            stream = SizeTrackingReader(fh, size, self.manager.redis, upload_key)

            if filename.endswith('.har'):
                stream, expected_size = self.har2warc(filename, stream)
                fh.close()
                fh = stream
                atexit.register(lambda: os.remove(stream.name))

            infos = self.parse_uploaded(stream, size)

            res = self.handle_upload(fh, upload_id, upload_key, infos, filename,
                                     user, False, size)

            assert 'error_message' not in res

        except Exception as e:
            traceback.print_exc()
            print('ERROR PARSING: ' + filename)
            print(e)

            if fh:
                rem = size - fh.tell()
                if rem > 0:
                    self.manager.redis.hincrby(upload_key, 'size', rem)

                self.manager.redis.hincrby(upload_key, 'files', -1)
                fh.close()
def inc_pending_size(self, size):
    """Increase outstanding size.

    :param int size: size
    """
    if not self.is_open(extend=False):
        return

    pending_size = self.PENDING_SIZE_KEY.format(rec=self.my_id)

    with redis_pipeline(self.redis) as pi:
        pi.incrby(pending_size, size)
        pi.expire(pending_size, self.PENDING_TTL)
def handle_upload(self, stream, upload_id, upload_key, infos, filename, user,
                  force_coll_name, total_size):
    """Handle WARC archive upload.

    :param stream: file object
    :param str upload_id: upload ID
    :param str upload_key: upload Redis key
    :param list infos: list of recordings
    :param str filename: WARC archive filename
    :param User user: user
    :param str force_coll_name: name of collection to upload into
    :param int total_size: size of WARC archive

    :returns: upload information
    :rtype: dict
    """
    logger.debug('Begin handle_upload() from: ' + filename +
                 ' force_coll_name: ' + str(force_coll_name))

    num_recs = 0
    num_recs = len(infos)
    # first info is for collection, not recording
    if num_recs >= 2:
        num_recs -= 1

    logger.debug('Parsed {0} recordings, Buffer Size {1}'.format(num_recs, total_size))

    first_coll, rec_infos = self.process_upload(user, force_coll_name, infos, stream,
                                                filename, total_size, num_recs)

    if not rec_infos:
        print('NO ARCHIVES!')
        #stream.close()
        return {'error': 'no_archive_data'}

    with redis_pipeline(self.redis) as pi:
        pi.hset(upload_key, 'coll', first_coll.name)
        pi.hset(upload_key, 'coll_title', first_coll.get_prop('title'))
        pi.hset(upload_key, 'filename', filename)
        pi.expire(upload_key, self.upload_exp)

    self.launch_upload(self.run_upload,
                       upload_key,
                       filename,
                       stream,
                       user,
                       rec_infos,
                       total_size,
                       first_coll)

    return {'upload_id': upload_id, 'user': user.name}
def _create_anon_user(self, user):
    max_size = self.redis.hget('h:defaults', 'max_anon_size')
    if not max_size:
        max_size = self.default_max_anon_size

    key = self.user_key.format(user=user)
    now = int(time.time())

    with redis_pipeline(self.redis) as pi:
        pi.hset(key, 'max_size', max_size)
        pi.hset(key, 'max_coll', 1)
        pi.hset(key, 'created_at', now)
        pi.hsetnx(key, 'size', '0')
def _update_redis_and_cookie(self, set_cookie, session, headers):
    duration = self.durations[session.dura_type]['total']

    if session.should_save:
        with redis_pipeline(self.redis) as pi:
            data = base64.b64encode(pickle.dumps(session._sesh))

            ttl = session.ttl
            # PERMA CUSTOMIZATION: changed from < to <=
            # https://github.com/webrecorder/webrecorder/pull/721
            if ttl <= 0:
                ttl = duration

            pi.setex(session.key, ttl, data)

            if set_cookie:
                self.track_long_term(session, pi)

                # set redis duration
                if session.curr_role != 'anon':
                    pi.expire(session.key, duration)

    elif set_cookie and session.curr_role != 'anon':
        # extend redis duration if extending cookie!
        self.redis.expire(session.key, duration)

    if not set_cookie:
        return

    expires = datetime.utcnow() + timedelta(seconds=duration)

    # set cookie
    sesh_cookie = session.get_cookie()

    value = '{0}={1}; Path=/; HttpOnly'

    # add max-age only if:
    # - long duration session
    # - anonymous session (not restricted)
    # don't set for restricted session, as cookie only valid as long as top session exists
    if session.dura_type == 'long' or session.curr_role == 'anon':
        value += '; max-age={3}'

    value = value.format(self.sesh_key,
                         sesh_cookie,
                         datetime_to_http_date(expires),
                         duration)

    scheme = session.environ.get('wsgi.url_scheme', '')
    if scheme.lower() == 'https':
        value += '; Secure'

    headers.append(('Set-Cookie', value))
def run_upload(self, upload_key, filename, stream, user, rec_infos, total_size):
    try:
        count = 0
        num_recs = len(rec_infos)
        last_end = 0

        for info in rec_infos:
            count += 1
            logger.debug('Id: {0}, Uploading Rec {1} of {2}'.format(upload_key, count, num_recs))

            if info['length'] > 0:
                self.do_upload(upload_key, filename, stream, user,
                               info['coll'], info['rec'],
                               info['offset'], info['length'])
            else:
                logger.debug('SKIP upload for zero-length recording')

            pages = info.get('pages')
            if pages is None:
                pages = self.detect_pages(user, info['coll'], info['rec'])

            if pages:
                self.manager.import_pages(user, info['coll'], info['rec'], pages)

            diff = info['offset'] - last_end
            last_end = info['offset'] + info['length']
            if diff > 0:
                self._add_split_padding(diff, upload_key)

    except:
        import traceback
        traceback.print_exc()

    finally:
        # add remainder of file, assumed consumed/skipped, if any
        last_end = stream.tell()
        stream.close()

        if last_end < total_size:
            diff = total_size - last_end
            self._add_split_padding(diff, upload_key)

        with redis_pipeline(self.manager.redis) as pi:
            pi.hincrby(upload_key, 'files', -1)
            pi.hset(upload_key, 'done', 1)
def create_recording(self, user, coll, rec, rec_title, coll_title='',
                     no_dupe=False, rec_type=None, ra_list=None):
    self.assert_can_write(user, coll)

    orig_rec = rec
    orig_rec_title = rec_title
    count = 1

    rec_list_key = self.rec_list_key.format(user=user, coll=coll)

    while True:
        key = self.rec_info_key.format(user=user, coll=coll, rec=rec)

        if self.redis.hsetnx(key, 'id', rec) == 1:
            break

        # don't create a duplicate, just use the specified recording
        if no_dupe:
            return self.get_recording(user, coll, rec)

        count += 1
        rec_title = orig_rec_title + ' ' + str(count)
        rec = orig_rec + '-' + str(count)

    now = int(time.time())

    if ra_list:
        ra_key = self.ra_key.format(user=user, coll=coll, rec=rec)

    with redis_pipeline(self.redis) as pi:
        pi.hset(key, 'title', rec_title)
        pi.hset(key, 'created_at', now)
        pi.hset(key, 'updated_at', now)
        pi.hsetnx(key, 'size', '0')

        if rec_type:
            pi.hset(key, 'rec_type', rec_type)

        pi.sadd(rec_list_key, rec)

        if ra_list:
            pi.sadd(ra_key, *ra_list)

    if not self._has_collection_no_access_check(user, coll):
        coll_title = coll_title or coll
        self.create_collection(user, coll, coll_title)

    return self.get_recording(user, coll, rec)
def add_urls_to_index(self, stream, params, filename, length):
    upload_key = params.get('param.upid')
    if upload_key:
        stream = SizeTrackingReader(stream, length, self.redis, upload_key)

    params['writer_cls'] = CDXJIndexer

    cdx_list = (super(WebRecRedisIndexer, self).
                  add_urls_to_index(stream, params, filename, length))

    # if replay key exists, add to it as well!
    coll_cdxj_key = res_template(self.coll_cdxj_key, params)
    if self.redis.exists(coll_cdxj_key):
        for cdx in cdx_list:
            if cdx:
                self.redis.zadd(coll_cdxj_key, 0, cdx)

    ts = datetime.now().date().isoformat()
    ts_sec = str(int(time.time()))

    with redis_pipeline(self.redis) as pi:
        for key_templ in self.size_keys:
            key = res_template(key_templ, params)
            pi.hincrby(key, 'size', length)

            if key_templ == self.rec_info_key_templ and cdx_list:
                pi.hset(key, 'updated_at', ts_sec)

        # write size to usage hashes
        if 'param.user' in params:
            if params['param.user'].startswith(self.temp_prefix):
                key = self.temp_usage_key

                # rate limiting
                rate_limit_key = self.get_rate_limit_key(params)
                if rate_limit_key:
                    pi.incrby(rate_limit_key, length)
                    pi.expire(rate_limit_key, self.rate_limit_ttl)
            else:
                key = self.user_usage_key

            if key:
                pi.hincrby(key, ts, length)

    return cdx_list
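# Note (assumption about the library version): the zadd call above uses the
# redis-py 2.x calling convention, zadd(key, score, member). Under redis-py 3.x
# and later the same operation is written with a mapping of member to score, e.g.:
#   self.redis.zadd(coll_cdxj_key, {cdx: 0})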
def add_mount(self, user, coll, rec, rec_title, mount_type, mount_desc, mount_config):
    rec_info = self.create_recording(user, coll, rec, rec_title)
    rec = rec_info['id']

    mount_key = self.mount_key.format(user=user, coll=coll, rec=rec)
    rec_key = self.rec_info_key.format(user=user, coll=coll, rec=rec)

    with redis_pipeline(self.redis) as pi:
        pi.set(mount_key, mount_config)
        pi.hset(rec_key, 'mount_type', mount_type)
        if mount_desc:
            pi.hset(rec_key, 'mount_desc', mount_desc)

    return rec_info
def set_recording_timestamps(self, user, coll, rec, created_at, updated_at):
    self.assert_can_write(user, coll)

    key = self.rec_info_key.format(user=user, coll=coll, rec=rec)

    # check existence before opening the pipeline: commands queued on a pipeline
    # don't return results until execute(), so the exists() check can't be
    # evaluated inside the with-block
    if not self.redis.exists(key):
        return False

    with redis_pipeline(self.redis) as pi:
        if created_at:
            pi.hset(key, 'created_at', created_at)

        if updated_at:
            pi.hset(key, 'updated_at', updated_at)

    return True
def dec_pending_count_and_size(self, size):
    """Decrease outstanding CDX index lines and size.

    :param int size: size
    """
    # return if rec no longer exists (deleted while transfer is pending)
    if not self.redis.exists(self.info_key):
        return

    pending_count = self.PENDING_COUNT_KEY.format(rec=self.my_id)
    pending_size = self.PENDING_SIZE_KEY.format(rec=self.my_id)

    with redis_pipeline(self.redis) as pi:
        pi.incrby(pending_count, -1)
        pi.incrby(pending_size, -size)
        pi.expire(pending_count, self.PENDING_TTL)
        pi.expire(pending_size, self.PENDING_TTL)
def _delete_redis_keys(self, type, user, coll, rec):
    key_templ = self.del_templ.get(type)
    if not key_templ:
        print('Unknown delete type ' + str(type))
        return

    key_pattern = key_templ.format(user=user, coll=coll, rec=rec)
    keys_to_del = list(self.redis.scan_iter(match=key_pattern))

    if type != 'user':
        del_info = self.info_keys[type].format(user=user, coll=coll, rec=rec)
        try:
            length = int(self.redis.hget(del_info, 'size'))
        except:
            print('Error decreasing size')
            return
    else:
        length = 0

    with redis_pipeline(self.redis) as pi:
        if type == 'coll':
            coll_list_key = self.coll_list_key_templ.format(user=user)
            pi.srem(coll_list_key, coll)
        elif type == 'rec':
            rec_list_key = self.rec_list_key_templ.format(user=user, coll=coll)
            pi.srem(rec_list_key, rec)

        if length > 0:
            user_key = self.info_keys['user'].format(user=user)
            pi.hincrby(user_key, 'size', -length)

            if type == 'rec':
                coll_key = self.info_keys['coll'].format(user=user, coll=coll)
                pi.hincrby(coll_key, 'size', -length)

        for key in keys_to_del:
            pi.delete(key)
def prepare_response(self, environ, headers):
    super(RedisSessionMiddleware, self).prepare_response(environ, headers)

    session = environ['webrec.session']

    if session.should_delete:
        self._delete_cookie(headers, self.sesh_key)
        self.redis.delete(session.key)
    else:
        if session.should_renew:
            self.redis.delete(session.key)
            sesh_id, session.key = self.make_id()
            session['id'] = sesh_id

        set_cookie = self.should_set_cookie(session)

        if set_cookie or session.should_save:
            with redis_pipeline(self.redis) as pi:
                self._update_redis_and_cookie(pi, set_cookie, session, headers)
def handle_upload(self, stream, upload_id, upload_key, infos, filename, user,
                  force_coll, total_size):
    logger.debug('Begin handle_upload() from: ' + filename +
                 ' force_coll: ' + str(force_coll))

    num_recs = 0
    num_recs = len(infos)
    # first info is for collection, not recording
    if num_recs >= 2:
        num_recs -= 1

    logger.debug('Parsed {0} recordings, Buffer Size {1}'.format(num_recs, total_size))

    first_coll, rec_infos = self.process_upload(user, force_coll, infos, stream,
                                                filename, total_size, num_recs)

    if not rec_infos:
        print('NO ARCHIVES!')
        #stream.close()
        return {'error_message': 'No Archive Data Found'}

    with redis_pipeline(self.manager.redis) as pi:
        pi.hset(upload_key, 'coll', first_coll['id'])
        pi.hset(upload_key, 'coll_title', first_coll['title'])
        pi.hset(upload_key, 'filename', filename)
        pi.expire(upload_key, self.upload_exp)

    self.launch_upload(self.run_upload,
                       upload_key,
                       filename,
                       stream,
                       user,
                       rec_infos,
                       total_size)

    return {'upload_id': upload_id, 'user': user}
def _update_redis_and_cookie(self, set_cookie, session, headers):
    duration = self.durations[session.dura_type]['total']

    if session.should_save:
        with redis_pipeline(self.redis) as pi:
            data = base64.b64encode(pickle.dumps(session._sesh))

            ttl = session.ttl
            if ttl < 0:
                ttl = duration

            pi.setex(session.key, ttl, data)

            if set_cookie:
                self.track_long_term(session, pi)

                # set redis duration
                pi.expire(session.key, duration)

    if not set_cookie:
        return

    expires = datetime.utcnow() + timedelta(seconds=duration)

    # set cookie
    sesh_cookie = self.id_to_signed_cookie(session['id'], session.is_restricted)

    value = '{0}={1}; Path=/; HttpOnly; max-age={3}'
    value = value.format(self.sesh_key,
                         sesh_cookie,
                         datetime_to_http_date(expires),
                         duration)

    scheme = session.environ.get('wsgi.url_scheme', '')
    if scheme.lower() == 'https':
        value += '; Secure'

    headers.append(('Set-Cookie', value))
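# For illustration only: with hypothetical values sesh_key='__test_sesh', a signed
# cookie value, and duration=3600, the cookie template above produces:
value = '{0}={1}; Path=/; HttpOnly; max-age={3}'
print(value.format('__test_sesh', 'abc123-signed', 'Wed, 01 Jan 2020 00:00:00 GMT', 3600))
# -> __test_sesh=abc123-signed; Path=/; HttpOnly; max-age=3600
# The HTTP-date is passed as positional argument {2} but never referenced, so no
# Expires attribute is emitted; only max-age controls the cookie lifetime.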
def init_new(self, title='', desc='', rec_type=None, ra_list=None):
    """Initialize new recording Redis building block.

    :param str title: title
    :param str desc: description
    :param rec_type: type of recording
    :type: str or None
    :param ra_list: remote archives
    :type: list or None

    :returns: component ID
    :rtype: str
    """
    rec = self._create_new_id()

    open_rec_key = self.OPEN_REC_KEY.format(rec=rec)

    self.data = {
        'title': title,
        'desc': desc,
        'size': 0,
    }

    if rec_type:
        self.data['rec_type'] = rec_type

    with redis_pipeline(self.redis) as pi:
        self._init_new(pi)

        if ra_list:
            ra_key = self.RA_KEY.format(rec=self.my_id)
            pi.sadd(ra_key, *ra_list)

        pi.setex(open_rec_key, self.OPEN_REC_TTL, 1)

    return rec
def run_upload(self, upload_key, filename, stream, user, rec_infos, total_size, first_coll):
    """Upload WARC archive.

    :param str upload_key: upload Redis key
    :param str filename: WARC archive filename
    :param stream: file object
    :param User user: user
    :param list rec_infos: list of recordings
    :param int total_size: size of WARC archive
    :param Collection first_coll: collection
    """
    try:
        count = 0
        num_recs = len(rec_infos)
        last_end = 0

        page_id_map = {}

        for info in rec_infos:
            count += 1
            logger.debug('Id: {0}, Uploading Rec {1} of {2}'.format(upload_key, count, num_recs))

            if info['length'] > 0:
                self.do_upload(upload_key, filename, stream, user.name,
                               info['coll'], info['rec'],
                               info['offset'], info['length'])
            else:
                logger.debug('SKIP upload for zero-length recording')

            # BEGIN PERMA CUSTOMIZATION
            # investigating https://github.com/harvard-lil/perma/issues/2602
            try:
                self.process_pages(info, page_id_map)
            except Exception as e:
                raise Exception("Exception processing pages for {}".format(first_coll.name)) from e
            # END PERMA CUSTOMIZATION

            diff = info['offset'] - last_end
            last_end = info['offset'] + info['length']
            if diff > 0:
                self._add_split_padding(diff, upload_key)

            recording = info['recording']
            recording.set_date_prop('created_at', info)
            recording.set_date_prop('recorded_at', info)
            recording.set_date_prop('updated_at', info)

        self.import_lists(first_coll, page_id_map)

        self.postprocess_coll(first_coll)

        first_coll.set_date_prop('created_at', first_coll.data, '_created_at')
        first_coll.set_date_prop('updated_at', first_coll.data, '_updated_at')

    except:
        traceback.print_exc()

    finally:
        # add remainder of file, assumed consumed/skipped, if any
        last_end = stream.tell()
        stream.close()

        if last_end < total_size:
            diff = total_size - last_end
            self._add_split_padding(diff, upload_key)

        with redis_pipeline(self.redis) as pi:
            pi.hincrby(upload_key, 'files', -1)
            pi.hset(upload_key, 'done', 1)

        if first_coll.is_external():
            first_coll.sync_coll_index(exists=False, do_async=False)
            first_coll.set_external_remove_on_expire()
def run_upload(self, upload_key, filename, stream, user, rec_infos, total_size, first_coll):
    """Upload WARC archive.

    :param str upload_key: upload Redis key
    :param str filename: WARC archive filename
    :param stream: file object
    :param User user: user
    :param list rec_infos: list of recordings
    :param int total_size: size of WARC archive
    :param Collection first_coll: collection
    """
    try:
        count = 0
        num_recs = len(rec_infos)
        last_end = 0

        page_id_map = {}

        for info in rec_infos:
            count += 1
            logger.debug('Id: {0}, Uploading Rec {1} of {2}'.format(upload_key, count, num_recs))

            if info['length'] > 0:
                self.do_upload(upload_key, filename, stream, user.name,
                               info['coll'], info['rec'],
                               info['offset'], info['length'])
            else:
                logger.debug('SKIP upload for zero-length recording')

            self.process_pages(info, page_id_map)

            diff = info['offset'] - last_end
            last_end = info['offset'] + info['length']
            if diff > 0:
                self._add_split_padding(diff, upload_key)

            recording = info['recording']
            recording.set_date_prop('created_at', info)
            recording.set_date_prop('recorded_at', info)
            recording.set_date_prop('updated_at', info)

        self.import_lists(first_coll, page_id_map)

        self.postprocess_coll(first_coll)

        first_coll.set_date_prop('created_at', first_coll.data, '_created_at')
        first_coll.set_date_prop('updated_at', first_coll.data, '_updated_at')

    except:
        traceback.print_exc()

    finally:
        # add remainder of file, assumed consumed/skipped, if any
        last_end = stream.tell()
        stream.close()

        if last_end < total_size:
            diff = total_size - last_end
            self._add_split_padding(diff, upload_key)

        with redis_pipeline(self.redis) as pi:
            pi.hincrby(upload_key, 'files', -1)
            pi.hset(upload_key, 'done', 1)

        if first_coll.is_external():
            first_coll.sync_coll_index(exists=False, do_async=False)
            first_coll.set_external_remove_on_expire()
def create_user(m, email=None, username=None, passwd=None, role=None, name=None):
    """Create a new user with command line arguments or series of prompts,
       performing basic validation
    """
    users = m.get_users()

    if not email:
        print('let\'s create a new user..')
        email = input('email: ').strip()

    # validate email
    if not re.match(r'[\w.-/+]+@[\w.-]+.\w+', email):
        print('valid email required!')
        return

    if email in [data['email_addr'] for u, data in users.items()]:
        print('A user already exists with {0} email!'.format(email))
        return

    username = username or input('username: ').strip()

    # validate username
    if not username:
        print('please enter a username!')
        return

    if not m.USER_RX.match(username) or username in m.RESTRICTED_NAMES:
        print('Invalid username..')
        return

    if username in users:
        print('Username already exists..')
        return

    name = name if name is not None else input('name (optional): ').strip()

    role = role if role in [r[0] for r in m.cork.list_roles()] else choose_role(m)

    if passwd is not None:
        passwd2 = passwd
    else:
        passwd = getpass('password: ')
        passwd2 = getpass('repeat password: ')

    if passwd != passwd2 or not m.PASS_RX.match(passwd):
        print('Passwords must match and be at least 8 characters long '
              'with lowercase, uppercase, and either digits or symbols.')
        return

    print('Creating user {username} with the email {email} and the role: '
          '\'{role}\''.format(username=username, email=email, role=role))

    # add user to cork
    m.cork._store.users[username] = {
        'role': role,
        'hash': m.cork._hash(username, passwd).decode('ascii'),
        'email_addr': email,
        'desc': '{{"name":"{name}"}}'.format(name=name),
        'creation_date': str(datetime.utcnow()),
        'last_login': str(datetime.utcnow()),
    }
    m.cork._store.save_users()

    # add user account defaults
    key = m.user_key.format(user=username)
    now = int(time.time())

    max_size, max_coll = m.redis.hmget('h:defaults', ['max_size', 'max_coll'])

    if not max_size:
        max_size = m.default_max_size

    if not max_coll:
        max_coll = m.default_max_coll

    with redis_pipeline(m.redis) as pi:
        pi.hset(key, 'max_size', max_size)
        pi.hset(key, 'max_coll', max_coll)
        pi.hset(key, 'created_at', now)
        pi.hset(key, 'name', name)
        pi.hsetnx(key, 'size', '0')

    if m.default_coll:
        # create initial collection
        m.create_collection(username,
                            coll=m.default_coll['id'],
                            coll_title=m.default_coll['title'],
                            desc=m.default_coll['desc'].format(username),
                            public=False)

    # email subscription set up?
    if m.mailing_list:
        m.add_to_mailing_list(username, email, name)

    print('All done!')
def upload_file(self):
    stream = None
    temp_file = None
    logger.debug('Upload Begin')

    expected_size = int(request.headers['Content-Length'])

    logger.debug('Expected Size: ' + str(expected_size))

    if not expected_size:
        return {'error_message': 'No File Specified'}

    curr_user = self.manager.get_curr_user()

    if not curr_user:
        #user = self.manager.get_anon_user()
        #force_coll = 'temp'
        #is_anon = True

        return {'error_message': 'Sorry, uploads only available for logged-in users'}

    user = curr_user
    force_coll = request.query.getunicode('force-coll', '')
    is_anon = False

    size_rem = self.manager.get_size_remaining(user)

    logger.debug('User Size Rem: ' + str(size_rem))

    if size_rem < expected_size:
        return {'error_message': 'Sorry, not enough space to upload this file'}

    if force_coll and not self.manager.has_collection(user, force_coll):
        if is_anon:
            self.manager.create_collection(user, force_coll, 'Temporary Collection')
        else:
            status = 'Collection {0} not found'.format(force_coll)
            return {'error_message': status}

    temp_file = SpooledTemporaryFile(max_size=BLOCK_SIZE)

    filename = request.query.getunicode('filename')

    stream = request.environ['wsgi.input']
    stream = CacheingLimitReader(stream, expected_size, temp_file)

    if filename.endswith('.har'):
        stream, expected_size = self.har2warc(filename, stream)
        temp_file.close()
        temp_file = stream

    infos = self.parse_uploaded(stream, expected_size)

    total_size = temp_file.tell()
    if total_size != expected_size:
        return {'error_message': 'size mismatch: expected {0}, got {1}'.format(expected_size, total_size)}

    upload_id = self._get_upload_id()

    upload_key = self.upload_key.format(user=user, upid=upload_id)

    with redis_pipeline(self.manager.redis) as pi:
        pi.hset(upload_key, 'size', 0)
        pi.hset(upload_key, 'total_size', total_size * 2)
        pi.hset(upload_key, 'filename', filename)
        pi.hset(upload_key, 'total_files', 1)
        pi.hset(upload_key, 'files', 1)

    return self.handle_upload(temp_file, upload_id, upload_key, infos, filename,
                              user, force_coll, total_size)
def rename(self):
    from_user = request.query.getunicode('from_user', '')
    from_coll = request.query.getunicode('from_coll', '')
    from_rec = request.query.getunicode('from_rec', '*')

    to_user = request.query.getunicode('to_user', '')
    to_coll = request.query.getunicode('to_coll', '')
    to_rec = request.query.getunicode('to_rec', '*')

    to_title = request.query.getunicode('to_title', '')

    if not from_user or not from_coll or not to_user or not to_coll:
        return {'error_message': 'user or coll params missing'}

    if (from_rec == '*' or to_rec == '*') and (from_rec != to_rec):
        return {'error_message': 'must specify rec name or "*" if moving entire coll'}

    # Move the redis keys, this performs the move as far as user is concerned
    match_pattern = ':' + from_user + ':' + from_coll + ':'
    replace_pattern = ':' + to_user + ':' + to_coll + ':'

    if to_rec != '*':
        match_pattern += from_rec + ':'
        replace_pattern += to_rec + ':'

    moves = {}

    for key in self.redis.scan_iter(match='*' + match_pattern + '*'):
        key = key.decode('utf-8')
        moves[key] = key.replace(match_pattern, replace_pattern)

    # Get Info Keys
    to_user_key = self.info_keys['user'].format(user=to_user)
    from_user_key = self.info_keys['user'].format(user=from_user)

    if to_rec != '*':
        to_coll_key = self.info_keys['coll'].format(user=to_user, coll=to_coll)
        from_coll_key = self.info_keys['coll'].format(user=from_user, coll=from_coll)

        to_coll_list_key = self.rec_list_key_templ.format(user=to_user, coll=to_coll)
        from_coll_list_key = self.rec_list_key_templ.format(user=from_user, coll=from_coll)

        info_key = self.info_keys['rec'].format(user=from_user, coll=from_coll, rec=from_rec)
        to_id = to_rec
    else:
        info_key = self.info_keys['coll'].format(user=from_user, coll=from_coll)
        to_id = to_coll

    the_size = int(self.redis.hget(info_key, 'size'))

    with redis_pipeline(self.redis) as pi:
        # Fix Id
        pi.hset(info_key, 'id', to_id)

        # Change title, if provided
        if to_title:
            pi.hset(info_key, 'title', to_title)

        # actual rename
        for from_key, to_key in iteritems(moves):
            pi.rename(from_key, to_key)

    with redis_pipeline(self.redis) as pi:
        # change user size, if different users
        if to_user_key != from_user_key:
            pi.hincrby(from_user_key, 'size', -the_size)
            pi.hincrby(to_user_key, 'size', the_size)

        # change coll size if moving rec and different colls
        if to_rec != '*' and to_coll_key != from_coll_key:
            pi.hincrby(from_coll_key, 'size', -the_size)
            pi.hincrby(to_coll_key, 'size', the_size)

        if to_rec != '*':
            pi.srem(from_coll_list_key, from_rec)
            pi.sadd(to_coll_list_key, to_rec)

    # rename WARCs (only if switching users)
    replace_list = []

    for key, name, url in self._iter_all_warcs(to_user, to_coll, to_rec):
        if not url.startswith(self.full_warc_prefix):
            continue

        filename = url[len(self.full_warc_prefix):]

        new_filename = filename.replace(from_user + '/', to_user + '/')

        repl = dict(key=key, name=name, old_v=filename, new_v=new_filename)
        replace_list.append(repl)

    if replace_list:
        if not self.queue_message('rename', {'replace_list': replace_list}):
            return {'error_message': 'no local clients'}

    #if self.storage_committer:
    #    storage = self.storage_committer.get_storage(to_user, to_coll, to_rec)
    #    if storage and not storage.rename(from_user, from_coll, from_rec,
    #                                      to_user, to_coll, to_rec):
    #        return {'error_message': 'remote rename failed'}

    return {'success': to_user + ':' + to_coll + ':' + to_rec}
def process_upload(self, user, force_coll_name, infos, stream, filename, total_size, num_recs, upload_key):
    """Process WARC archive.

    :param User user: user
    :param str force_coll_name: name of collection to upload into
    :param list infos: list of recordings (indices)
    :param stream: file object
    :param str filename: WARC archive filename
    :param int total_size: WARC archive size
    :param int num_recs: number of recordings
    :param str upload_key: upload Redis key

    :returns: collection and recordings
    :rtype: Collection and list
    """
    stream.seek(0)

    count = 0

    first_coll = None
    collection = None
    recording = None

    if force_coll_name:
        collection = user.get_collection_by_name(force_coll_name)

    rec_infos = []

    lists = None

    # track whether the first page url has been recorded yet
    i = 0

    for info in infos:
        type = info.get('type')

        if type == 'collection':
            if not collection:
                collection = self.make_collection(user, filename, info)

            lists = info.get('lists')

        if type == 'recording':
            if not collection:
                collection = self.make_collection(user, filename, self.upload_coll_info, info)

            desc = info.get('desc', '')

            # store the url of the first page on the collection and the upload status hash
            pages = info.get('pages') or []
            url = pages[0].get('url') if pages else None
            if url and i == 0:
                i += 1
                collection = user.get_collection_by_name(force_coll_name)
                collection['url'] = url

                with redis_pipeline(self.redis) as pi:
                    pi.hset(upload_key, 'url', url)

                collection.mark_updated()

            # if title was auto-generated for compatibility on export,
            # set title to blank
            if info.get('auto_title'):
                title = ''
            else:
                title = info.get('title', '')

            recording = collection.create_recording(title=title,
                                                    desc=desc,
                                                    rec_type=info.get('rec_type'),
                                                    ra_list=info.get('ra'))

            info['id'] = recording.my_id
            count += 1
            #yield collection, recording
            logger.debug('Processing Upload Rec {0} of {1}'.format(count, num_recs))

            rec_infos.append({'coll': collection.my_id,
                              'rec': recording.my_id,
                              'offset': info['offset'],
                              'length': info['length'],
                              'pages': info.get('pages', None),
                              'collection': collection,
                              'recording': recording,
                              'created_at': info.get('created_at'),
                              'updated_at': info.get('updated_at'),
                              'recorded_at': info.get('recorded_at', info.get('updated_at')),
                             })

            if not first_coll:
                first_coll = collection

    if lists:
        collection.data['_lists'] = lists

    return first_coll, rec_infos