def delete_expired_external(self):
    """Delete any expired external collections owned by non-temp users.

    Scans redis for all external-collection keys; for each collection whose
    owner is a real (non-anonymous) user and whose cdxj index no longer
    exists, removes the collection from the owner and deletes it.
    """
    all_ext_templ = Collection.EXTERNAL_KEY.format(coll='*')
    for ext_key in self.data_redis.scan_iter(all_ext_templ):
        try:
            # key layout: '<prefix>:<coll_id>:<rest>'
            _, coll, _2 = ext_key.split(':', 2)
            collection = Collection(my_id=coll,
                                    redis=self.data_redis,
                                    access=BaseAccess())

            user = collection.get_owner()
            # skip ownerless or anonymous (temp) users -- handled elsewhere
            if not user or user.is_anon():
                continue

            if not collection.has_cdxj():
                logger.debug('TempChecker: Delete Expired External Coll: ' + collection.name)
                user.remove_collection(collection, delete=True)

        except Exception:
            # log the full traceback through the module logger instead of
            # printing to stderr; keep scanning the remaining keys
            logger.exception('TempChecker: error deleting external collection')
def process_cdxj_key(self, cdxj_key):
    """Commit the recording referenced by *cdxj_key* to storage if it is
    no longer open."""
    # key layout: '<prefix>:<rec_id>:<rest>'
    rec_id = cdxj_key.split(':', 2)[1]
    recording = Recording(my_id=rec_id,
                          redis=self.redis,
                          access=BaseAccess())

    # still-open recordings are left alone (do not extend their lifetime)
    if recording.is_open(extend=False):
        return

    recording.commit_to_storage()
def create_write_buffer(self, params, name):
    """Create a TempWriteBuffer for the recording identified in *params*.

    Side effect: stores the Recording object into params['recording'].
    """
    # prefer the recorder-scoped param, fall back to the plain one
    rec_id = params.get('param.recorder.rec')
    if not rec_id:
        rec_id = params.get('param.rec')

    recording = Recording(my_id=rec_id, redis=self.redis, access=BaseAccess())
    params['recording'] = recording

    return TempWriteBuffer(recording, name, params['url'])
def delete_if_expired(self, temp_user, temp_dir):
    """Delete the temp user's session/dir once expired or committed.

    Returns True when cleanup is finished for this user, False when it
    should be retried on a later pass.

    NOTE(review): part of this function was redacted in this copy — the
    '******' span below replaces the original code that deleted the expired
    user and opened the else-branch that removed the temp dir. Restore the
    missing code from VCS history before using this file.
    """
    temp_key = 't:' + temp_user
    sesh = self.sesh_redis.get(temp_key)

    # 'commit-wait': user upgraded to a permanent account; wait until the
    # temp dir is empty (migration done), then remove the session
    if sesh == 'commit-wait':
        try:
            if not os.path.isdir(temp_dir):
                logger.debug(
                    'TempChecker: Remove Session For Already Deleted Dir: ' +
                    temp_dir)
                self.sesh_redis.delete(temp_key)
                return True

            logger.debug('TempChecker: Removing if empty: ' + temp_dir)
            # rmdir only succeeds on an empty dir -- failure means commit
            # is still in progress
            os.rmdir(temp_dir)
            #shutil.rmtree(temp_dir)
            logger.debug('TempChecker: Deleted empty dir: ' + temp_dir)
            self.sesh_redis.delete(temp_key)
        except Exception as e:
            logger.debug('TempChecker: Waiting for commit')
            return False

    # temp user key exists
    elif self.data_redis.exists(User.INFO_KEY.format(user=temp_user)):
        # if user still active, don't remove
        if self.sesh_redis.get(self.sesh_key_template.format(sesh)):
            #print('Skipping active temp ' + temp)
            return False

        # delete user
        # NOTE(review): redacted span follows ('******') — original code here
        # deleted the user and started the else-branch try-block.
        logger.debug('TempChecker: Deleting expired user: '******'TempChecker: Deleted expired temp dir: ' + temp_dir)
            shutil.rmtree(temp_dir)
        except Exception as e:
            logger.warn(str(e))
            return False

    return True
def process_cdxj_key(self, cdxj_key):
    """Handle one cdxj key: delete orphaned recordings, commit closed ones."""
    # key layout: '<prefix>:<rec_id>:<rest>'
    rec_id = cdxj_key.split(':', 2)[1]
    recording = Recording(my_id=rec_id,
                          redis=self.redis,
                          access=BaseAccess())

    # a recording with no owner is orphaned -- remove it entirely
    if not recording.get_owner():
        logger.debug('Deleting Invalid Rec: ' + recording.my_id)
        recording.delete_object()
    elif not recording.is_open(extend=False):
        # closed recordings get committed to permanent storage
        recording.commit_to_storage()
def __init__(self, redis_url=None):
    """Initialize the CLI user manager with redis and cork backends.

    When *redis_url* is omitted it is read from the REDIS_BASE_URL
    environment variable (raises KeyError if unset).
    """
    config = load_wr_config()
    self.base_access = BaseAccess()

    # Init Redis
    redis_url = redis_url or os.environ['REDIS_BASE_URL']
    redis_conn = redis.StrictRedis.from_url(redis_url, decode_responses=True)

    # Init Cork
    cork = WebRecCork.create_cork(redis_conn, config)

    super(CLIUserManager, self).__init__(redis=redis_conn,
                                         cork=cork,
                                         config=config)
def test_sync_avoid_double_load(self):
    """Two overlapping sync_coll_index calls must load the cdxj only once."""
    self.assert_exists(COLL_CDXJ, False)()
    self.assert_exists(REC_CDXJ, False)()

    anon = User(redis=self.redis, my_id=self.anon_user, access=BaseAccess())
    coll = anon.get_collection_by_name('temp')

    # first sync kicks off an async load of the temp cdxj key
    coll.sync_coll_index(exists=False, do_async=True)
    time.sleep(0.1)
    self.assert_exists(REC_CDXJ_T, True)()

    # a second sync while the first is in flight must not trigger a reload
    coll.sync_coll_index(exists=True, do_async=True)
    time.sleep(0.1)
    self.assert_exists(REC_CDXJ_T, True)()

    # temp key eventually disappears; the loader must have run exactly once
    self.sleep_try(0.1, 0.5, self.assert_exists(REC_CDXJ_T, False))
    assert load_counter == 1
def delete_if_expired(self, temp_user, temp_dir):
    """Clean up an expired temp user: session, redis data, and temp dir.

    Returns True when cleanup is complete for this user, False when it
    should be retried on a later pass.

    NOTE(review): part of this function was redacted in this copy — the
    '******' span below replaces the original code that logged the user id
    and loaded the User object. Restore the missing code from VCS history
    before using this file.
    """
    temp_key = 't:' + temp_user
    sesh = self.sesh_redis.get(temp_key)

    if sesh == 'commit-wait':
        # This temporary user has signed up for a permanent account and
        # their collections will be migrated to storage.
        # Clean up if that migration is complete (i.e. the dir is empty).
        # Otherwise, wait.
        if os.path.isdir(temp_dir):
            try:
                logger.debug('TempChecker: Removing if empty: ' + temp_dir)
                # rmdir only succeeds on an empty dir
                os.rmdir(temp_dir)
                logger.debug('TempChecker: Deleted empty dir: ' + temp_dir)
            except OSError as e:
                # ENOTEMPTY: commit still in progress; ENOENT: already gone
                if e.errno == errno.ENOTEMPTY:
                    logger.debug('TempChecker: Waiting for commit')
                elif e.errno != errno.ENOENT:
                    logger.error(str(e))
                return False
        else:
            logger.debug(
                'TempChecker: Removing Session For Already Deleted Dir: ' +
                temp_dir)
        self.sesh_redis.delete(temp_key)
        return True

    # temp user key exists
    elif self.data_redis.exists(User.INFO_KEY.format(user=temp_user)):
        # if user still active, don't remove
        if self.sesh_redis.get(self.sesh_key_template.format(sesh)):
            return False

        # NOTE(review): redacted span follows ('******') — original code here
        # logged the user id, loaded the User, and marked the user's open
        # recordings "closed".
        logger.debug('TempChecker: Deleting expired user: '******'s open recordings "closed";
        # return (if necessary) to give time for closing logic to complete
        wait_to_delete = False
        for collection in user.get_collections(load=False):
            for recording in collection.get_recordings(load=False):
                if recording.is_open(extend=False):
                    recording.set_closed()
                    logger.debug('TempChecker: Closing temp recording: ' + recording.my_id)
                    wait_to_delete = True

        if wait_to_delete:
            return False

        # delete the user; signal that the user's collections should be deleted.
        # the temp dir containing those collections will be deleted on next pass.
        user.delete_me()

        # delete the session
        self.sesh_redis.delete(temp_key)
        return True

    # no user session, remove temp dir and everything in it
    else:
        try:
            logger.debug('TempChecker: Deleted expired temp dir: ' + temp_dir)
            shutil.rmtree(temp_dir)
        except OSError as e:
            # ENOENT means another pass already removed it -- not an error
            if e.errno != errno.ENOENT:
                logger.error(str(e))
            return False

    return True
def __init__(self, redis):
    """Store the redis connection and expose user/role/registration tables."""
    self.redis = redis
    self.access = BaseAccess()

    # all users, resolved through the current access context
    self.users = UserTable(redis, self.get_access)
    # role definitions and pending e-mail registrations live in plain hashes
    self.roles = RedisTable(redis, 'h:roles')
    self.pending_registrations = RedisTable(redis, 'h:register')
def process_new_pages(self):
    """Drain the new-pages queue, group pages by recording, and POST one
    browsertrix crawl request per recording to generate derivatives.

    NOTE(review): part of this function was redacted in this copy — the
    '******' span below replaces the original code that printed the invalid
    user id, continued the loop, and looked up the collection by name.
    Restore the missing code from VCS history before using this file.
    """
    crawl_groups = {}

    # drain the queue, grouping page entries by their recording id
    while True:
        data = self.redis.rpop(Collection.NEW_PAGES_Q)
        if not data:
            break

        page_data = json.loads(data)
        rec = page_data['rec']
        if rec not in crawl_groups:
            crawl_groups[rec] = {
                'user': page_data['user'],
                'coll': page_data['coll'],
                'coll_name': page_data['coll_name'],
                'pages': []
            }

        crawl_groups[rec]['pages'].append({
            'pid': page_data['pid'],
            'url': page_data['url'],
            'timestamp': page_data['timestamp'],
            'title': page_data.get('title'),
        })

        # a page may name an explicit derivatives recording to reuse
        if page_data.get('derivs_rec'):
            crawl_groups[rec]['derivs_rec'] = page_data.get('derivs_rec')

    # one crawl request per recording group
    for rec, data in crawl_groups.items():
        user = User(my_id=data['user'], redis=self.redis, access=BaseAccess())
        if not user:
            # NOTE(review): redacted span follows ('******')
            print('Invalid User: '******'coll_name'])

        if not collection:
            print('Invalid Collection: ' + data['coll_name'])
            continue

        recording = collection.get_recording(rec)

        # if a specific derivates recording is provided, use that
        derivs_rec = data.get('derivs_rec')

        # otherwise create derivates recording if none exists
        if not derivs_rec:
            derivs_recording = recording.get_derivs_recording()
            if not derivs_recording:
                title = 'Derivatives for: Session from ' + recording.to_iso_date(
                    recording['created_at'], no_T=True)
                derivs_recording = collection.create_recording(
                    title=title, rec_type='derivs')
                recording.set_derivs_recording(derivs_recording)

            derivs_rec = derivs_recording.my_id

        # build the browsertrix crawl definition from the shared template
        crawl_def = SEARCH_CRAWL_DEF.copy()
        crawl_def['coll'] = crawl_def['screenshot_coll'] = crawl_def[
            'text_coll'] = data['coll']

        crawl_def['user_params'] = {
            'user': data['user'],
            'coll': data['coll'],
            'coll_name': data['coll_name'],
            'rec': derivs_rec,
            'type': 'replay-coll',
            # updated later
            'request_ts': '',
            'browser': BROWSER
        }

        crawl_def['name'] = 'text-' + data['user'] + '-' + data['coll']
        crawl_def['seed_urls'] = data['pages']

        print(crawl_def)
        r = requests.post(self.browsertrix_url, json=crawl_def)
        print(r.text)
def _get_access(self):
    """Return the access context for the current operation.

    A pending admin override yields a one-shot unrestricted BaseAccess;
    otherwise the per-request access object is returned.
    """
    if not self.admin_override:
        return request['webrec.access']

    # the override is single-use: clear it before granting access
    self.admin_override = False
    return BaseAccess()