def exposed_delete_spcnet_invalid_url_pages():
    '''
    So the spcnet.tv forum software generates THOUSANDS of garbage links somehow.
    Anyways, delete those.

    Scans both the live WebPages table and its Continuum history table for
    rows whose content contains the "Invalid Forum specified" error text,
    and hands the matching row IDs to delete_internal() for chunked removal.
    '''
    sess = db.get_db_session()
    tables = [
        db.WebPages.__table__,
        version_table(db.WebPages)
    ]
    for ctbl in tables:
        # Query for the IDs of every affected row in this table.
        q = sess.query(ctbl.c.id) \
            .filter(ctbl.c.netloc == "www.spcnet.tv") \
            .filter(ctbl.c.content.like('%Invalid Forum specified. If you followed a valid link, please notify the%'))
        print("Query:")
        print(q)
        ids = q.all()

        # Collapse duplicates.
        ids = set(ids)

        # Returned list of IDs is each ID packed into a 1-tuple. Unwrap those tuples so it's just a list of integer IDs.
        ids = [tmp[0] for tmp in ids]

        # Typo fixes: "Fount" -> "Found", "requring" -> "requiring".
        print("Found %s rows requiring deletion. Deleting." % len(ids))
        delete_internal(sess, ids)
        sess.commit()
def checkHaveHistory(self, url):
    '''
    Return the number of rows in the WebPages history table whose url
    matches `url`.
    '''
    history = version_table(db.WebPages)
    query = self.db_sess.query(history).filter(history.c.url == url)
    return query.count()
def relink_row_sequence(self, rows):
    '''
    Each Sqlalchemy-Continum transaction references the next transaction
    in the chain as it's `end_transaction_id` except the most recent
    (where the value of end_transaction_id is `None`).

    Walk the history newest-first, rewriting each row's `end_transaction_id`
    to point at the transaction that follows it, so the history linked list
    is consistent. Returns True if any row needed fixing.
    '''
    ctbl = version_table(db.WebPages)

    # Sort newest transaction first. Done in place, matching the original
    # behavior (the caller's list ends up sorted too).
    rows.sort(reverse=True, key=lambda row: (row.id, row.transaction_id, row.end_transaction_id))

    next_tid = None
    dirty = False
    for row in rows:
        if row.end_transaction_id != next_tid:
            self.log.info("Need to change end_transaction_id from %s to %s", row.end_transaction_id, next_tid)
            fixup = ctbl.update() \
                .where(ctbl.c.id == row.id) \
                .where(ctbl.c.transaction_id == row.transaction_id) \
                .values(end_transaction_id=next_tid)
            # print(fixup)
            self.sess.execute(fixup)
            dirty = True
        next_tid = row.transaction_id
    return dirty
def relink_row_sequence(self, sess, rows):
    '''
    Each Sqlalchemy-Continum transaction references the next transaction
    in the chain as it's `end_transaction_id` except the most recent
    (where the value of end_transaction_id is `None`).

    Walk the history newest-first, rewriting each row's `end_transaction_id`
    to point at the transaction that follows it, so the history linked list
    is consistent. Returns True if any row needed fixing.
    '''
    ctbl = version_table(db.WebPages)

    # Sort newest transaction first. Done in place, matching the original
    # behavior (the caller's list ends up sorted too).
    rows.sort(reverse=True, key=lambda row: (row.id, row.transaction_id, row.end_transaction_id))

    next_tid = None
    dirty = False
    for row in rows:
        if row.end_transaction_id != next_tid:
            self.log.info(
                "Need to change end_transaction_id from %s to %s",
                row.end_transaction_id, next_tid)
            fixup = ctbl.update() \
                .where(ctbl.c.id == row.id) \
                .where(ctbl.c.transaction_id == row.transaction_id) \
                .values(end_transaction_id=next_tid)
            # print(fixup)
            sess.execute(fixup)
            dirty = True
        next_tid = row.transaction_id
    return dirty
def checkHaveHistory(self, url):
    '''
    Return the number of rows in the RawWebPages history table whose url
    matches `url`.
    '''
    history = version_table(self.db.RawWebPages)
    query = self.db_sess.query(history).filter(history.c.url == url)
    return query.count()
def render():
    '''
    Render a page (current or historical version) as JSON for the viewer.

    Query params:
        url     - page URL to render (required).
        version - "None" or the repr of a (row_id, transaction_id) tuple
                  identifying a historical version.
        nocache - truthy to bypass the cache (incompatible with `version`).
    '''
    req_url = request.args.get('url')
    if not req_url:
        return build_error_response(message="Error! No page specified!")

    # Removed a redundant second `request.args.get('url')` call here.
    version      = request.args.get('version')
    ignore_cache = request.args.get("nocache")
    filterstate  = get_filter_state_for_url(req_url)

    try:
        if version == "None":
            version = None
        else:
            # `version` arrives as the repr of a (row_id, transaction_id) tuple.
            version = ast.literal_eval(version)
    # BUGFIX: ast.literal_eval raises SyntaxError (not just ValueError) on
    # malformed input; previously that propagated as a 500.
    except (ValueError, SyntaxError):
        return build_error_response(
            message="Error! Historical version number must be an integer!")

    if version and ignore_cache:
        return build_error_response(
            message="Error! Cannot render a historical version with nocache!")

    if version:
        rid, tid = version
        print("Historical row id: ", rid, tid)
        ctbl = version_table(db.WebPages)
        rows = g.session.query(ctbl.c.title, ctbl.c.content) \
            .filter(ctbl.c.id == rid) \
            .filter(ctbl.c.transaction_id == tid) \
            .all()
        # BUGFIX: previously an empty result left title/content/cachestate
        # undefined, producing a NameError at jsonify() below.
        if not rows:
            return build_error_response(
                message="Error! Specified historical version not found!")
        row = rows.pop()
        title, content = row
        content = utilities.replace_links(content)
        cachestate = "Historical version: %s" % (version, )
    else:
        title, content, cachestate = WebMirror.API.getPage(
            req_url, ignore_cache=ignore_cache, version=version)

    # print("Render-Version: ", version, type(version))
    # print("Rendering with nocache=", ignore_cache)
    # print("Return:", cachestate)
    response = jsonify(
        title=title,
        contents=content,
        cachestate=cachestate,
        filterstate=filterstate,
        req_url=req_url,
    )
    return set_cache_control_headers(response)
def checkHaveHistory(self, url):
    '''
    Return the number of WebPages history rows for `url`, or the sentinel
    value 99 when versioning is disabled entirely.
    '''
    # Only do version fiddling if versioning is enabled.
    if not settings.DO_VERSIONING:
        return 99
    history = version_table(db.WebPages)
    query = self.db_sess.query(history).filter(history.c.url == url)
    return query.count()
def exposed_delete_gravitytales_bot_blocked_pages():
    '''
    Delete the "checking you're not a bot" garbage pages
    that sometimes get through the gravitytales scraper.
    '''
    sess = db.get_db_session()
    for ctbl in [db.WebPages.__table__, version_table(db.WebPages)]:
        # Build one bulk DELETE per table (live table + history table).
        query = ctbl.delete() \
            .where(ctbl.c.netloc == "gravitytales.com") \
            .where(ctbl.c.content.like('%<div id="bot-alert" class="alert alert-info">%'))
        print(query)
        sess.execute(query)
        sess.commit()
def delete_internal(sess, ids):
    '''
    Delete the rows with the given primary-key `ids` from both the WebPages
    table and its Continuum history table, in chunks of 5000, retrying each
    chunk on transient transaction errors.

    `ids` may contain IDs that only exist in the history table.
    '''
    if ids:
        # Typo fixes: "Doint" -> "Doing", "requiring update" -> "requiring
        # deletion" (this routine deletes, it does not update).
        print("Doing delete. %s rows requiring deletion." % (len(ids), ))
    else:
        print("No rows needing deletion.")
        return

    ctbl = version_table(db.WebPages)
    chunk_size = 5000
    for chunk_idx in range(0, len(ids), chunk_size):
        chunk = ids[chunk_idx:chunk_idx+chunk_size]
        # Retry forever on transient DB errors; each chunk is its own
        # transaction (committed below), so a retry redoes only this chunk.
        while 1:
            try:
                # Allow ids that only exist in the history table by falling back to a
                # history-table query if the main table doesn't have the ID.
                try:
                    ex = sess.query(db.WebPages.url).filter(db.WebPages.id == chunk[0]).one()[0]
                except sqlalchemy.orm.exc.NoResultFound:
                    ex = sess.query(ctbl.c.url).filter(ctbl.c.id == chunk[0]).all()[0][0]
                print("Example removed URL: '%s'" % (ex))

                q1 = sess.query(db.WebPages).filter(db.WebPages.id.in_(chunk))
                affected_rows_main = q1.delete(synchronize_session=False)

                q2 = sess.query(ctbl).filter(ctbl.c.id.in_(chunk))
                affected_rows_ver = q2.delete(synchronize_session=False)

                sess.commit()
                print("Deleted %s rows (%s version table rows). %0.2f%% done." %
                    (affected_rows_main, affected_rows_ver, 100 * ((chunk_idx) / len(ids))))
                break
            except sqlalchemy.exc.InternalError:
                print("Transaction error (sqlalchemy.exc.InternalError). Retrying.")
                sess.rollback()
            except sqlalchemy.exc.OperationalError:
                print("Transaction error (sqlalchemy.exc.OperationalError). Retrying.")
                sess.rollback()
            except sqlalchemy.exc.IntegrityError:
                print("Transaction error (sqlalchemy.exc.IntegrityError). Retrying.")
                sess.rollback()
            except sqlalchemy.exc.InvalidRequestError:
                print("Transaction error (sqlalchemy.exc.InvalidRequestError). Retrying.")
                traceback.print_exc()
                sess.rollback()
def view_history():
    '''
    Flask view: show the stored version history for a URL, or dispatch to
    rendering / deletion of a specific historical version.
    '''
    req_url = request.args.get('url')
    if not req_url:
        return render_template('error.html',
                       title='Viewer',
                       message="Error! No page specified!")

    version   = request.args.get('version')
    delete_id = request.args.get("delete_id")
    delete    = request.args.get("delete")
    print(version, delete_id, delete)

    # A bare version (no delete params) means "render that version".
    if version and not (delete_id or delete):
        return render_template('view.html',
                       title='Rendering Content',
                       req_url=req_url,
                       version=version)

    ctbl = version_table(db.WebPages)
    history = g.session.query(ctbl.c.id, ctbl.c.state, ctbl.c.fetchtime, ctbl.c.transaction_id) \
        .filter(ctbl.c.url == req_url) \
        .order_by(ctbl.c.fetchtime) \
        .all()

    # Number the rows, newest first.
    versions = list(enumerate(history))
    versions.reverse()

    if delete_id and delete:
        return do_history_delete(versions, version, delete_id, delete)

    return render_template('history.html',
                       title='Item History',
                       req_url=req_url,
                       versions=versions)
def purge_invalid_urls(selected_netloc=None):
    '''
    For each ruleset with both netlocs and badwords, delete rows whose url
    contains any badword AND whose netloc is in the ruleset's netlocs, from
    both the live WebPages table and its Continuum history table.

    If `selected_netloc` is given, only rulesets containing that netloc are
    processed.
    '''
    sess = db.get_db_session()
    for ruleset in WebMirror.rules.load_rules():
        if (
                (ruleset['netlocs'] and ruleset['badwords'])
                and
                (
                    (ruleset['netlocs'] and ruleset['badwords'] and selected_netloc is None)
                    or
                    (selected_netloc != None and selected_netloc in ruleset['netlocs'])
                )
            ):
            # We have to delete from the normal table before the versioning table,
            # because deleting from the normal table causes inserts into the versioning table
            # due to the change tracking triggers.
            count = 1
            loc = and_(
                db.WebPages.netloc.in_(ruleset['netlocs']),
                or_(*(db.WebPages.url.like("%{}%".format(badword)) for badword in ruleset['badwords']))
            )

            # print("Doing count on table ")
            # count = sess.query(db.WebPages) \
            #     .filter(loc) \
            #     .count()

            if count == 0:
                print("{num} items match badwords from file {file}. No deletion required ".format(file=ruleset['filename'], num=count))
            else:
                print("{num} items match badwords from file {file}. Deleting ".format(file=ruleset['filename'], num=count))
                # BUGFIX: was `.filter(or_(*loc))`. Unpacking the and_() clause
                # list into or_() turned "netloc AND badword" into
                # "netloc OR badword", which would delete every row on the
                # netloc and every row matching any badword anywhere.
                sess.query(db.WebPages) \
                    .filter(loc) \
                    .delete(synchronize_session=False)

            # Do the delete from the versioning table now.
            ctbl = version_table(db.WebPages)
            loc2 = and_(
                ctbl.c.netloc.in_(ruleset['netlocs']),
                or_(*(ctbl.c.url.like("%{}%".format(badword)) for badword in ruleset['badwords']))
            )

            # print("Doing count on Versioning table ")
            # count = sess.query(ctbl) \
            #     .filter(loc2) \
            #     .count()

            if count == 0:
                print("{num} items in versioning table match badwords from file {file}. No deletion required ".format(file=ruleset['filename'], num=count))
            else:
                print("{num} items in versioning table match badwords from file {file}. Deleting ".format(file=ruleset['filename'], num=count))
                # BUGFIX: same AND->OR unpacking issue as above.
                sess.query(ctbl) \
                    .filter(loc2) \
                    .delete(synchronize_session=False)

            sess.commit()
def do_history_delete(versions, version, delete_id, delete):
    '''
    Delete one (or all-but-newest) historical versions of a page.

    Parameters:
        versions  - list of (display_index, history_row) tuples, as built by
                    view_history().
        version   - display index of the row to delete, as a string; the pair
                    (-1, -1) with delete_id means "delete all old versions".
        delete_id - primary-key id of the row to delete (cross-check), string.
        delete    - must be the literal string "True" to proceed.

    Returns a rendered template describing the outcome (success and failure
    are both reported via 'error.html').
    '''
    ctbl = version_table(db.WebPages)
    if delete != "True":
        return render_template('error.html', title='Error when deleting!', message="Delete param not true?")
    try:
        version = int(version)
    except ValueError:
        return render_template(
            'error.html',
            title='Error when deleting!',
            message="Cannot convert version value to integer!")
    try:
        delete_id = int(delete_id)
    except ValueError:
        return render_template(
            'error.html',
            title='Error when deleting!',
            message="Cannot convert delete_id value to integer!")
    # Map display index -> history row for direct lookup.
    versions = dict(versions)
    if delete_id == -1 and version == -1:
        # Bulk mode: drop everything except the newest (highest-index) version.
        maxid = max(versions.keys())
        # NOTE: the loop variable deliberately shadows the `version` parameter
        # here; the parameter's value (-1) is no longer needed in this branch.
        for vid, version in versions.items():
            if vid != maxid:
                rid, tid = version.id, version.transaction_id
                print("Deleting:", version, rid, tid)
                g.session.query(ctbl) \
                    .filter(ctbl.c.id == rid) \
                    .filter(ctbl.c.transaction_id == tid) \
                    .delete(synchronize_session=False)
        g.session.commit()
        return render_template('error.html', title='All old versions deleted', message="All old versions deleted")
    else:
        if not version in versions:
            return render_template(
                'error.html',
                title='Error when deleting!',
                message="Version value doesn't exist? ('%s', '%s')" % (version, type(version)))
        target = versions[version]
        # Require the caller-supplied row id to match the row we resolved,
        # guarding against stale links deleting the wrong row.
        if not target.id == delete_id:
            return render_template(
                'error.html',
                title='Error when deleting!',
                message="Delete row PK Id doesn't match specified delete ID?")
        print("Deleting:", target)
        g.session.query(ctbl) \
            .filter(ctbl.c.id == target.id) \
            .filter(ctbl.c.transaction_id == target.transaction_id) \
            .delete(synchronize_session=False)
        g.session.commit()
        return render_template('error.html', title='Row deleted', message="Row: '%s', '%s'" % (delete_id, version))
def exposed_purge_invalid_urls(selected_netloc=None):
    '''
    Iterate over each ruleset in the rules directory, and generate a compound
    query that will delete any matching rows.
    For rulesets with a large number of rows, or many badwords, this
    can be VERY slow.

    Similar in functionality to `clear_bad`, except it results in many
    fewer queryies, and is therefore likely much more performant.
    '''
    sess = db.get_db_session()
    for ruleset in WebMirror.rules.load_rules():
        if (
                (ruleset['netlocs'] and ruleset['badwords'])
                and
                (
                    (ruleset['netlocs'] and ruleset['badwords'] and selected_netloc is None)
                    or
                    (selected_netloc != None and selected_netloc in ruleset['netlocs'])
                )
            ):
            # We have to delete from the normal table before the versioning table,
            # because deleting from the normal table causes inserts into the versioning table
            # due to the change tracking triggers.
            count = 1
            ands = [
                or_(*(db.WebPages.url.like("%{}%".format(badword)) for badword in ruleset['badwords']))
            ]
            if selected_netloc:
                ands.append((db.WebPages.netloc == selected_netloc))
            else:
                ands.append(db.WebPages.netloc.in_(ruleset['netlocs']))
            loc = and_(*ands)

            # print("Doing count on table ")
            # count = sess.query(db.WebPages) \
            #     .filter(loc) \
            #     .count()

            if selected_netloc:
                print(loc)

            if count == 0:
                print("{num} items match badwords from file {file}. No deletion required ".format(file=ruleset['filename'], num=count))
            else:
                print("{num} items match badwords from file {file}. Deleting ".format(file=ruleset['filename'], num=count))
                # BUGFIX: was `.filter(or_(*loc))`. Unpacking the and_() clause
                # list into or_() turned "netloc AND badword" into
                # "netloc OR badword", deleting far more rows than intended.
                sess.query(db.WebPages) \
                    .filter(loc) \
                    .delete(synchronize_session=False)

            # Do the delete from the versioning table now.
            ctbl = version_table(db.WebPages)
            loc2 = and_(
                ctbl.c.netloc.in_(ruleset['netlocs']),
                or_(*(ctbl.c.url.like("%{}%".format(badword)) for badword in ruleset['badwords']))
            )

            # print("Doing count on Versioning table ")
            # count = sess.query(ctbl) \
            #     .filter(loc2) \
            #     .count()

            if count == 0:
                print("{num} items in versioning table match badwords from file {file}. No deletion required ".format(file=ruleset['filename'], num=count))
            else:
                print("{num} items in versioning table match badwords from file {file}. Deleting ".format(file=ruleset['filename'], num=count))
                # BUGFIX: same AND->OR unpacking issue as above.
                sess.query(ctbl) \
                    .filter(loc2) \
                    .delete(synchronize_session=False)

            sess.commit()
def truncate_url_history(self, url):
    '''
    Thin out the stored version history for `url`.

    Deletes incomplete rows and rows with neither content nor a file
    reference, then buckets the remaining content-bearing rows by the
    nearest time in self.snap_times, keeping only the newest row per
    bucket. Finally relinks the surviving rows' end_transaction_id chain
    and commits (or rolls back if nothing changed).
    '''
    ctbl = version_table(db.WebPages)
    items = self.sess.query(ctbl) \
        .filter(ctbl.c.url == url) \
        .all()

    items.sort(key=lambda x: (x.id, x.transaction_id, x.end_transaction_id))
    # for x in items:
    #     print(x.id, x.transaction_id, x.end_transaction_id)

    deleted_1 = 0  # rows dropped as incomplete / empty
    deleted_2 = 0  # rows dropped as redundant within a snapshot bucket
    orig_cnt = len(items)
    datevec = self.snap_times
    self.log.info("Clearing history for URL: %s (items: %s)", url, orig_cnt)
    attachments = {}
    for item in items:
        if item.state != "complete":
            deleted_1 += 1
            self.log.info("Deleting incomplete item for url: %s!", url)
            self.sess.execute(ctbl.delete().where(ctbl.c.id == item.id).where(ctbl.c.transaction_id == item.transaction_id))
        elif item.content == None and item.file == None:
            self.log.info("Deleting item without a file and no content for url: %s!", url)
            # print(type(item), item.mimetype, item.file, item.content)
            # print(ctbl.delete().where(ctbl.c.id == item.id).where(ctbl.c.transaction_id == item.transaction_id))
            self.sess.execute(ctbl.delete().where(ctbl.c.id == item.id).where(ctbl.c.transaction_id == item.transaction_id))
            deleted_1 += 1
        elif item.content != None:
            # print(type(item), item.keys(), item.addtime, item.fetchtime)
            # Bucket content-bearing rows by the closest snapshot time.
            closest = min(datevec, key=self.diff_func(item))
            if not closest in attachments:
                attachments[closest] = []
            attachments[closest].append(item)
        elif item.file != None:
            # BUGFIX/consistency: file-backed rows are valid and are simply
            # kept as-is. Previously they fell through to the "Wat?" branch.
            # This matches the sess-parameter variant of this method.
            pass
        else:
            print("Wat?")

    self.log.info("Found %s items missing both file reference and content", deleted_1)

    keys = list(attachments.keys())
    keys.sort()

    out = []
    for key in keys:
        superset = attachments[key]
        if len(superset) > 1:
            # print("lolercoaster")
            # Keep only the newest row in this bucket (prefer fetchtime,
            # fall back to addtime when fetchtime is unset).
            superset.sort(key=lambda x: (x.addtime if x.fetchtime is None else x.fetchtime, x.id, x.transaction_id), reverse=True)
            out.append(superset[0])
            # print(superset[0].fetchtime, superset[0].id, superset[0].transaction_id)
            for tmp in superset[1:]:
                self.sess.execute(ctbl.delete().where(ctbl.c.id == tmp.id).where(ctbl.c.transaction_id == tmp.transaction_id))
                deleted_2 += 1
        elif len(superset) == 1:
            out.append(superset[0])
        else:
            raise ValueError("Wat? Key with no items!")

    deleted = deleted_1 + deleted_2

    seq_dirty = self.relink_row_sequence(out)

    if deleted > 0 or seq_dirty:
        # Rewrite the tid links so the history renders properly
        self.log.info("Committing because %s items were removed!", deleted)
        self.sess.commit()
    else:
        self.sess.rollback()

    self.log.info("Deleted: %s items when simplifying history, Total deleted: %s, remaining: %s",
        deleted_2, deleted, orig_cnt-deleted)
def truncate_url_history(self, sess, url):
    '''
    Thin out the stored version history for `url` using the supplied session.

    Deletes incomplete rows and rows with neither content nor a file
    reference, then buckets the remaining content-bearing rows by the nearest
    time in self.snap_times, keeping only the newest row per bucket. Finally
    relinks the survivors' end_transaction_id chain and commits (or rolls
    back if nothing changed).
    '''
    ctbl = version_table(db.WebPages)
    items = sess.query(ctbl) \
        .filter(ctbl.c.url == url) \
        .all()

    items.sort(
        key=lambda x: (x.id, x.transaction_id, x.end_transaction_id))
    # for x in items:
    #     print(x.id, x.transaction_id, x.end_transaction_id)

    deleted_1 = 0  # rows dropped as incomplete / empty
    deleted_2 = 0  # rows dropped as redundant within a snapshot bucket
    orig_cnt = len(items)
    datevec = self.snap_times
    self.log.info("Clearing history for URL: %s (items: %s)", url, orig_cnt)
    attachments = {}
    for item in items:
        if item.state != "complete":
            deleted_1 += 1
            self.log.info("Deleting incomplete item for url: %s!", url)
            sess.execute(ctbl.delete().where(ctbl.c.id == item.id).where(
                ctbl.c.transaction_id == item.transaction_id))
        elif item.content == None and item.file == None:
            self.log.info(
                "Deleting item without a file and no content for url: %s!",
                url)
            # print(type(item), item.mimetype, item.file, item.content)
            # print(ctbl.delete().where(ctbl.c.id == item.id).where(ctbl.c.transaction_id == item.transaction_id))
            sess.execute(ctbl.delete().where(ctbl.c.id == item.id).where(
                ctbl.c.transaction_id == item.transaction_id))
            deleted_1 += 1
        elif item.content != None:
            # print(type(item), item.keys(), item.addtime, item.fetchtime)
            # Bucket content-bearing rows by the closest snapshot time.
            closest = min(datevec, key=self.diff_func(item))
            if not closest in attachments:
                attachments[closest] = []
            attachments[closest].append(item)
        elif item.file != None:
            # File-backed rows are valid; keep them untouched.
            pass
        else:
            print("Wat?")

    self.log.info("Found %s items missing both file reference and content", deleted_1)

    keys = list(attachments.keys())
    keys.sort()

    out = []
    for key in keys:
        superset = attachments[key]
        if len(superset) > 1:
            # print("lolercoaster")
            # Keep only the newest row in this bucket (prefer fetchtime,
            # fall back to addtime when fetchtime is unset).
            superset.sort(key=lambda x: (x.addtime if x.fetchtime is None else x.fetchtime, x.id, x.transaction_id), reverse=True)
            out.append(superset[0])
            # print(superset[0].fetchtime, superset[0].id, superset[0].transaction_id)
            for tmp in superset[1:]:
                sess.execute(
                    ctbl.delete().where(ctbl.c.id == tmp.id).where(
                        ctbl.c.transaction_id == tmp.transaction_id))
                deleted_2 += 1
        elif len(superset) == 1:
            out.append(superset[0])
        else:
            raise ValueError("Wat? Key with no items!")

    deleted = deleted_1 + deleted_2

    seq_dirty = self.relink_row_sequence(sess, out)

    if deleted > 0 or seq_dirty:
        # Rewrite the tid links so the history renders properly
        self.log.info("Committing because %s items were removed!", deleted)
        sess.commit()
    else:
        sess.rollback()

    self.log.info(
        "Deleted: %s items when simplifying history, Total deleted: %s, remaining: %s",
        deleted_2, deleted, orig_cnt - deleted)
def exposed_purge_invalid_url_history():
    '''
    Functionally identical to `purge_invalid_urls`, but operates
    on the history table only.

    This means that it will operate on row IDs that have been
    deleted from the main DB (intentionally or not)
    '''
    with db.session_context() as sess:
        ctbl = version_table(db.WebPages)
        for ruleset in WebMirror.rules.load_rules():
            if ruleset['netlocs'] and ruleset['badwords']:
                agg_bad = [tmp for tmp in ruleset['badwords']]
                agg_bad.extend(common.global_constants.GLOBAL_BAD_URLS)

                # So there's no way to escape a LIKE string in postgres.....
                # NOTE(review): the final .replace(r"\\", r"\\") is a no-op,
                # and a literal backslash should arguably be escaped *before*
                # the "_"/"%" replacements — confirm intent before changing.
                search_strs = [
                    "%{}%".format(
                        badword.replace(r"_", r"\_").replace(r"%", r"\%").replace(
                            r"\\", r"\\"))
                    for badword in agg_bad
                ]

                print("Badwords:")
                for bad in search_strs:
                    print("	Bad: ", bad)
                print("Netlocs:")
                print(ruleset['netlocs'])

                # We have to delete from the normal table before the versioning table,
                # because deleting from the normal table causes inserts into the versioning table
                # due to the change tracking triggers.
                ands = [or_(*(ctbl.c.url.like(ss) for ss in search_strs))]

                print("Filtering by all netlocs in rule file.")
                ands.append(ctbl.c.netloc.in_(ruleset['netlocs']))

                ids = sess.query(ctbl.c.id) \
                    .filter(and_(*ands)) \
                    .all()

                # Collapse duplicates
                ids = set(ids)

                # BUGFIX: was `if ids == 0:` — comparing a set to the integer 0
                # is always False, so the "No deletion required" branch was
                # unreachable.
                if not ids:
                    print(
                        "{num} items match badwords from file {file}. No deletion required "
                        .format(file=ruleset['filename'], num=len(ids)))
                else:
                    print(
                        "{num} items match badwords from file {file}. Deleting "
                        .format(file=ruleset['filename'], num=len(ids)))

                # Returned list of IDs is each ID packed into a 1-tuple. Unwrap those tuples so it's just a list of integer IDs.
                ids = [tmp[0] for tmp in ids]

                delete_internal(sess, ids, ruleset['netlocs'], ruleset['badwords'])