Example #1
def exposed_delete_spcnet_invalid_url_pages():
	'''
	So the spcnet.tv forum software generates THOUSANDS of garbage links somehow.
	Anyway, delete those.
	'''
	with db.session_context() as sess:
		tables = [
			db.WebPages.__table__,
			version_table(db.WebPages.__table__)
		]

		for ctbl in tables:
			# Query for the affected row IDs, printing the query for debugging.
			q = sess.query(ctbl.c.id) \
				.filter(ctbl.c.netloc == "www.spcnet.tv") \
				.filter(ctbl.c.content.like('%Invalid Forum specified. If you followed a valid link, please notify the%'))
			print("Query:")
			print(q)
			ids = q.all()

			ids = set(ids)

			# Returned list of IDs is each ID packed into a 1-tuple. Unwrap those tuples so it's just a list of integer IDs.
			ids = [tmp[0] for tmp in ids]

			print("Found %s rows requiring deletion. Deleting." % len(ids))
			delete_internal(sess, ids)
			sess.commit()
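
All of these examples resolve the Continuum history table at runtime via `version_table()`. Below is a minimal, self-contained sketch of that lookup (not taken from the examples on this page); it assumes the function is `sqlalchemy_continuum.utils.version_table` and uses a hypothetical `Article` model in place of `db.WebPages`, with Continuum's default `_version` table naming.

import sqlalchemy as sa
from sqlalchemy.orm import declarative_base
from sqlalchemy_continuum import make_versioned
from sqlalchemy_continuum.utils import version_table

make_versioned(user_cls=None)       # enable versioning without a user model

Base = declarative_base()

class Article(Base):                # hypothetical versioned model
    __tablename__ = 'article'
    __versioned__ = {}
    id = sa.Column(sa.Integer, primary_key=True)
    title = sa.Column(sa.String)

sa.orm.configure_mappers()          # Continuum builds the *_version tables here

# version_table() maps a versioned model's Table to its generated history Table.
hist = version_table(Article.__table__)
print(hist.name)                    # 'article_version' with the default naming
print('transaction_id' in hist.c)   # True: Continuum's transaction bookkeeping column
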
Example #2
	def relink_row_sequence(self, sess, rows):
		'''
		Each SQLAlchemy-Continuum transaction references the next transaction in the chain via its `end_transaction_id`,
		except the most recent (where `end_transaction_id` is `None`).

		Therefore, we iterate over the history in reverse, and for each item set its `end_transaction_id` to the
		id of the next (newer) transaction, so the history linked list works correctly.
		'''

		ctbl = version_table(db.WebPages.__table__)

		rows.sort(reverse=True, key=lambda x: (x.id, x.transaction_id, x.end_transaction_id))
		end_transaction_id = None
		dirty = False
		for x in rows:
			if x.end_transaction_id != end_transaction_id:
				self.log.info("Need to change end_transaction_id from %s to %s", x.end_transaction_id, end_transaction_id)

				update = ctbl.update().where(ctbl.c.id == x.id).where(ctbl.c.transaction_id == x.transaction_id).values(end_transaction_id=end_transaction_id)
				# print(update)
				sess.execute(update)

				dirty = True
			end_transaction_id = x.transaction_id

		return dirty
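
A plain-Python sketch of the relinking walk above (no database; `HistRow` and the row values are made up): iterating newest-to-oldest, each row's `end_transaction_id` is pointed at the transaction that superseded it, while the newest row keeps `None`.

from dataclasses import dataclass
from typing import Optional

@dataclass
class HistRow:                  # hypothetical stand-in for a *_version row
    id: int
    transaction_id: int
    end_transaction_id: Optional[int]

rows = [                        # already sorted newest-first, as in relink_row_sequence()
    HistRow(1, 30, None),
    HistRow(1, 20, 99),         # stale link: should point at transaction 30
    HistRow(1, 10, 20),         # already correct
]

end_transaction_id = None
dirty = False
for row in rows:
    if row.end_transaction_id != end_transaction_id:
        row.end_transaction_id = end_transaction_id
        dirty = True
    end_transaction_id = row.transaction_id

for row in rows:
    print(row)                  # 30 -> None, 20 -> 30, 10 -> 20
print("dirty:", dirty)          # True (one row was rewritten)
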
Example #3
def exposed_purge_raw_invalid_urls_from_history():
	'''
	Delete all raw-archiver rows that aren't
	attached to an archiver module.
	'''

	sess1 = db.get_db_session(postfix='iter_sess')
	sess2 = db.get_db_session(postfix='delete_sess')

	ctbl = version_table(db.RawWebPages.__table__)

	print("Loading files from database...")
	# spinner1 = Spinner()

	est = sess1.execute("SELECT reltuples::BIGINT AS estimate FROM pg_class WHERE relname='raw_web_pages_version';")
	res = est.scalar()
	print("Estimated row-count: %s" % res)

	last_bad = ""
	deleted = 0
	total_rows = 0
	last_commit = 0
	maxlen = 0
	changed_rows = 0
	with tqdm.tqdm(total=res) as pbar:
		bad = 0

		for rurl, rnetloc in sess1.query(ctbl.c.url, ctbl.c.netloc).yield_per(1000):
			modules_wants_url = any([mod.cares_about_url(rurl) for mod in RawArchiver.RawActiveModules.ACTIVE_MODULES])
			has_badwords      = any([badword in rurl for badword in common.global_constants.GLOBAL_BAD_URLS])
			if not modules_wants_url or has_badwords:
				last_bad = rnetloc
				# print("Unwanted: ", rurl)

				changed_rows = sess2.query(ctbl) \
					.filter(ctbl.c.url == rurl) \
					.delete(synchronize_session=False)

				bad += 1
				deleted += 1
			total_rows += 1

			if bad > 5000:
				# print("Committing!")
				bad = 0
				last_commit = deleted
				sess2.commit()
				# pbar.set_description("Doing Commit", refresh=True)
			else:
				msg = "Deleted: %s, since commit: %s, last_bad: '%s' (%s, %s%%)" % \
					(deleted, deleted-last_commit, last_bad, changed_rows, 100.0*(deleted / total_rows))
				maxlen = max(len(msg), maxlen)
				pbar.set_description(msg.ljust(maxlen), refresh=False)


			pbar.update(n=1)

	sess1.commit()
	sess2.commit()
Example #4
    def checkHaveHistory(self, sess, url):
        # Only do version fiddling if versioning is enabled.
        if not settings.DO_VERSIONING:
            return 99

        ctbl = version_table(self.db.RawWebPages.__table__)
        query = sess.query(exists().where(ctbl.c.url == url))
        return query.scalar()
Example #5
def render():
    req_url = request.args.get('url')
    if not req_url:
        return build_error_response(message="Error! No page specified!")
    req_url = request.args.get('url')

    version = request.args.get('version')
    ignore_cache = request.args.get("nocache")

    filterstate = get_filter_state_for_url(req_url)

    try:
        # Treat a missing version the same as "None", as in the view_rendered() example.
        if version == "None" or version is None:
            version = None
        else:
            version = ast.literal_eval(version)
    except ValueError:
        return build_error_response(
            message="Error! Historical version number must be an integer!")

    if version and ignore_cache:
        return build_error_response(
            message="Error! Cannot render a historical version with nocache!")

    if version:
        rid, tid = version

        print("Historical row id: ", rid, tid)

        ctbl = version_table(db.WebPages.__table__)

        rows = g.session.query(ctbl.c.title, ctbl.c.content) \
         .filter(ctbl.c.id == rid)                        \
         .filter(ctbl.c.transaction_id == tid)            \
         .all()

        if rows:
            row = rows.pop()
            title, content = row
            content = utilities.replace_links(content)
            cachestate = "Historical version: %s" % (version, )
    else:
        title, content, cachestate = WebMirror.API.getPage(
            req_url, ignore_cache=ignore_cache, version=version)

    # print("Render-Version: ", version, type(version))
    # print("Rendering with nocache=", ignore_cache)
    # print("Return:", cachestate)
    response = jsonify(
        title=title,
        contents=content,
        cachestate=cachestate,
        filterstate=filterstate,
        req_url=req_url,
    )
    return set_cache_control_headers(response)
Example #6
    def checkHaveHistory(self, url):
        # Only do version fiddling if versioning is enabled.
        if not settings.DO_VERSIONING:
            return 99

        ctbl = version_table(db.WebPages.__table__)

        count = self.db_sess.query(ctbl) \
         .filter(ctbl.c.url == url)   \
         .count()
        return count
Example #7
def exposed_delete_gravitytales_bot_blocked_pages():
    '''
    Delete the "checking you're not a bot" garbage pages
    that sometimes get through the gravitytales scraper.
    '''
    with db.session_context() as sess:
        tables = [db.WebPages.__table__, version_table(db.WebPages.__table__)]

        for ctbl in tables:
            delete_stmt = ctbl.delete() \
             .where(ctbl.c.netloc == "gravitytales.com") \
             .where(ctbl.c.content.like('%<div id="bot-alert" class="alert alert-info">%'))
            print(delete_stmt)
            sess.execute(delete_stmt)
            sess.commit()
Example #8
def delete_internal(sess, ids):

	if ids:
		print("Doing delete. %s rows requiring deletion." % (len(ids), ))
	else:
		print("No rows needing deletion.")
		return

	ctbl = version_table(db.WebPages.__table__)
	chunk_size = 5000
	for chunk_idx in range(0, len(ids), chunk_size):
		chunk = ids[chunk_idx:chunk_idx+chunk_size]
		while 1:
			try:

				# Allow ids that only exist in the history table by falling back to a
				# history-table query if the main table doesn't have the ID.
				try:
					ex = sess.query(db.WebPages.url).filter(db.WebPages.id == chunk[0]).one()[0]
				except sqlalchemy.orm.exc.NoResultFound:
					ex = sess.query(ctbl.c.url).filter(ctbl.c.id == chunk[0]).all()[0][0]

				print("Example removed URL: '%s'" % (ex))


				q1 = sess.query(db.WebPages).filter(db.WebPages.id.in_(chunk))
				affected_rows_main = q1.delete(synchronize_session=False)

				q2 = sess.query(ctbl).filter(ctbl.c.id.in_(chunk))
				affected_rows_ver = q2.delete(synchronize_session=False)

				sess.commit()
				print("Deleted %s rows (%s version table rows). %0.2f%% done." %
						(affected_rows_main, affected_rows_ver,  100 * ((chunk_idx) / len(ids))))
				break
			except (sqlalchemy.exc.InternalError, sqlalchemy.exc.OperationalError, sqlalchemy.exc.IntegrityError) as e:
				print("Transaction error (%s). Retrying." % type(e).__name__)
				sess.rollback()
			except sqlalchemy.exc.InvalidRequestError:
				print("Transaction error (sqlalchemy.exc.InvalidRequestError). Retrying.")
				traceback.print_exc()
				sess.rollback()
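
The chunk-and-retry loop above boils down to the following plain-Python sketch (no database; `flaky_delete` is a hypothetical stand-in for the delete-and-commit step that occasionally raises a transient error).

import random

def flaky_delete(chunk):
    # Hypothetical stand-in: pretend ~10% of attempts hit a transient DB error.
    if random.random() < 0.1:
        raise RuntimeError("transient transaction error")
    return len(chunk)

def delete_in_chunks(ids, chunk_size=5000):
    deleted = 0
    for chunk_idx in range(0, len(ids), chunk_size):
        chunk = ids[chunk_idx:chunk_idx + chunk_size]
        while True:
            try:
                deleted += flaky_delete(chunk)
                break                   # this chunk is done, move on
            except RuntimeError:
                pass                    # roll back / retry the same chunk
    return deleted

print(delete_in_chunks(list(range(12345))))   # always 12345, however many retries
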
Example #9
def view_history():
    req_url = request.args.get('url')
    if not req_url:
        return render_template('error.html',
                               title='Viewer',
                               message="Error! No page specified!")

    version = request.args.get('version')
    delete_id = request.args.get("delete_id")
    delete = request.args.get("delete")

    print(version, delete_id, delete)

    if version and not (delete_id or delete):
        return render_template('view.html',
                               title='Rendering Content',
                               req_url=req_url,
                               version=version)

    versions = []

    ctbl = version_table(db.WebPages.__table__)


    versions = g.session.query(ctbl.c.id, ctbl.c.state, ctbl.c.addtime, ctbl.c.fetchtime, ctbl.c.transaction_id) \
     .filter(ctbl.c.url == req_url)                                    \
     .order_by(ctbl.c.fetchtime)                                       \
     .all()

    versions = list(enumerate(versions))
    versions.reverse()

    if delete_id and delete:
        return do_history_delete(versions, version, delete_id, delete)

    return render_template('history.html',
                           title='Item History',
                           req_url=req_url,
                           versions=versions)
Example #10
	def truncate_url_history(self, sess, url):

		# last_check = db.get_from_version_check_table(sess, url)
		# if last_check > datetime.datetime.now() - FLATTEN_SCAN_INTERVAL:
		# 	self.log.info("Url %s checked within the check interval (%s, %s). Skipping.", url, FLATTEN_SCAN_INTERVAL, last_check)
		# 	return 0
		# else:
		# 	self.log.info("Url %s last checked %s.", url, last_check)


		ctbl = version_table(db.WebPages.__table__)

		if url in self.feed_urls:
			self.log.info("Feed URL (%s)! Deleting history wholesale!", url)

			# res = sess.execute(
			# 		ctbl.delete() \
			# 		.where(ctbl.c.url == url)
			# 	)
			# self.log.info("Modified %s rows", res.rowcount)
			# sess.commit()
			# self.log.info("Committed. Setting version log.")
			# db.set_in_version_check_table(sess, url, datetime.datetime.now())
			# new_val = db.get_from_version_check_table(sess, url)
			# self.log.info("New value from DB: %s", new_val)

			return



		self.log.info("Counting rows for url %s.", url)
		orig_cnt = sess.query(ctbl)           \
			.filter(ctbl.c.url == url)     \
			.count()

		self.log.info("Found %s results for url %s. Fetching rows", orig_cnt, url)

		deleted_1 = 0
		deleted_2 = 0

		datevec = self.snap_times
		attachments = {}

		for item in tqdm.tqdm(
			sess.query(ctbl)                               \
			.filter(ctbl.c.url == url)                     \
			.order_by(ctbl.c.id, ctbl.c.transaction_id)    \
			.yield_per(50), total=orig_cnt):

			if item.state != "complete" and item.state != 'error':
				deleted_1 += 1
				self.log.info("Deleting incomplete item for url: %s (state: %s)!", url, item.state)
				sess.execute(ctbl.delete().where(ctbl.c.id == item.id).where(ctbl.c.transaction_id == item.transaction_id))
			elif item.content is None and item.file is None:
				self.log.info("Deleting item without a file and no content for url: %s!", url)
				# print(type(item), item.mimetype, item.file, item.content)
				# print(ctbl.delete().where(ctbl.c.id == item.id).where(ctbl.c.transaction_id == item.transaction_id))
				sess.execute(ctbl.delete().where(ctbl.c.id == item.id).where(ctbl.c.transaction_id == item.transaction_id))
				deleted_1 += 1
			elif item.content is not None:
				closest = min(datevec, key=self.diff_func(item))
				if closest not in attachments:
					attachments[closest] = []

				attachments[closest].append({
						'addtime'        : item.addtime,
						'fetchtime'      : item.fetchtime,
						'id'             : item.id,
						'transaction_id' : item.transaction_id,
					})

			elif item.file is not None:
				pass
			else:
				print("Wat?")



		self.log.info("Found %s items missing both file reference and content", deleted_1)
		keys = list(attachments.keys())
		keys.sort()

		out = []

		for key in tqdm.tqdm(keys):
			superset = attachments[key]
			if len(superset) > 1:
				# print("lolercoaster")
				superset.sort(key=lambda x: (x['addtime'] if x['fetchtime'] is None else x['fetchtime'], x['id'], x['transaction_id']), reverse=True)
				out.append(superset[0])
				# print(superset[0].fetchtime, superset[0].id, superset[0].transaction_id)
				self.log.info("Deleting %s items (out of %s) from date-segment %s", len(superset)-1, len(superset), key)
				for tmp in superset[1:]:
					sess.execute(ctbl.delete().where(ctbl.c.id == tmp['id']).where(ctbl.c.transaction_id == tmp['transaction_id']))
					deleted_2 += 1
			elif len(superset) == 1:
				out.append(superset[0])
			else:
				raise ValueError("Wat? Key with no items!")

		deleted = deleted_1 + deleted_2
		# seq_dirty = self.relink_row_sequence(sess, out)
		# if deleted > 0 or seq_dirty:
		if deleted > 0:
			# Rewrite the tid links so the history renders properly
			# self.log.info("Committing because %s items were removed!", deleted)
			sess.commit()
		else:
			sess.rollback()

		db.set_in_version_check_table(sess, url, datetime.datetime.now())

		self.log.info("Deleted: %s items when simplifying history, %s incomplete items, Total deleted: %s, remaining: %s", deleted_2, deleted_1, deleted, orig_cnt-deleted)
		return deleted
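
The date-bucketing step in truncate_url_history() reduces to this sketch (plain Python; the snapshot times and rows are made up): each history row is assigned to its nearest snapshot time, and only the newest row in each bucket survives.

import datetime

snap_times = [
    datetime.datetime(2023, 1, 1),
    datetime.datetime(2023, 6, 1),
    datetime.datetime(2024, 1, 1),
]

rows = [
    {'id': 1, 'transaction_id': 10, 'fetchtime': datetime.datetime(2023, 1, 3)},
    {'id': 1, 'transaction_id': 11, 'fetchtime': datetime.datetime(2023, 1, 20)},
    {'id': 1, 'transaction_id': 12, 'fetchtime': datetime.datetime(2023, 12, 25)},
]

buckets = {}
for row in rows:
    closest = min(snap_times, key=lambda snap: abs(snap - row['fetchtime']))
    buckets.setdefault(closest, []).append(row)

keep = []
for snap, bucket in sorted(buckets.items()):
    bucket.sort(key=lambda r: (r['fetchtime'], r['id'], r['transaction_id']), reverse=True)
    keep.append(bucket[0])          # newest row in the bucket survives
    # everything in bucket[1:] would be deleted from the version table

print([r['transaction_id'] for r in keep])   # [11, 12]
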
Example #11
    def checkHaveHistory(self, url):
        ctbl = version_table(self.db.RawWebPages.__table__)

        (count, ), = self.db_sess.query(exists().where(ctbl.c.url == url))

        return count
Example #12
def sync_filtered_with_filesystem():
    with db.session_context() as sess:
        tgtpath = settings.RESOURCE_DIR
        ctbl = version_table(db.RawWebPages.__table__)

        print("Loading files from database...")
        # spinner1 = Spinner()

        est = sess.execute(
            "SELECT reltuples::BIGINT AS estimate FROM pg_class WHERE relname='raw_web_pages';"
        )
        res = est.scalar()

        vest = sess.execute(
            "SELECT reltuples::BIGINT AS estimate FROM pg_class WHERE relname='raw_web_pages_version';"
        )
        vres = vest.scalar()

        print("Estimated row-count: %s, version table: %s" % (res, vres))

        in_main_db = []
        with tqdm(total=res) as pbar:
            for row in sess.query(db.WebFiles).yield_per(10000):
                if row.fspath:
                    in_main_db.append(row.fspath)
                    pbar.update(n=1)

        in_history_db = []

        with tqdm(total=vres) as pbar:
            for rfspath, in sess.query(ctbl.c.fspath).yield_per(1000):
                if rfspath:
                    in_history_db.append(rfspath)
                    pbar.update(n=1)

        origl_main = len(in_main_db)
        origl_hist = len(in_history_db)
        in_db_main = set(in_main_db)
        in_db_hist = set(in_history_db)

        in_db = in_db_main | in_db_hist  # set union; '+' is not defined for sets

        print("")
        print("%s files, %s unique" % ((origl_main, origl_hist),
                                       (len(in_db_main), len(in_db_hist))))
        print("Enumerating files from disk...")
        agg_files = []
        have_files = []
        # spinner2 = Spinner()

        with tqdm(total=len(in_db)) as pbar:
            for root, _, files in os.walk(tgtpath):
                for filen in files:
                    fqpath = os.path.join(root, filen)
                    fpath = fqpath[len(tgtpath) + 1:]

                    if fpath in in_db:
                        pbar.update(n=1)
                        # spinner2.next(star=True, vlen=0)
                        have_files.append(fpath)
                    else:
                        pbar.update(n=1)
                        # spinner2.next(vlen=1)
                        agg_files.append(fpath)
                        fqpath = os.path.join(tgtpath, fpath)
                        # os.unlink(fqpath)
                        print("\rDeleting: %s  " % fqpath)
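
The sync logic above in miniature (hypothetical paths, no database): union the path sets from the main and history tables, then walk the resource directory and collect any file not referenced by either set.

import os

in_db_main = {'a/1.bin', 'a/2.bin'}
in_db_hist = {'a/2.bin', 'b/3.bin'}
in_db = in_db_main | in_db_hist          # set union, not '+'

tgtpath = '/tmp/resource_dir'            # hypothetical RESOURCE_DIR
orphaned = []
for root, _, files in os.walk(tgtpath):
    for filen in files:
        fqpath = os.path.join(root, filen)
        relpath = os.path.relpath(fqpath, tgtpath)
        if relpath not in in_db:
            orphaned.append(fqpath)      # candidate for deletion

print("Orphaned files:", orphaned)
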
Example #13
def do_history_delete(versions, version, delete_id, delete):

    ctbl = version_table(db.WebPages.__table__)

    if delete != "True":
        return render_template('error.html',
                               title='Error when deleting!',
                               message="Delete param not true?")
    try:
        version = int(version)
    except ValueError:
        return render_template(
            'error.html',
            title='Error when deleting!',
            message="Cannot convert version value to integer!")
    try:
        delete_id = int(delete_id)
    except ValueError:
        return render_template(
            'error.html',
            title='Error when deleting!',
            message="Cannot convert delete_id value to integer!")

    versions = dict(versions)

    if delete_id == -1 and version == -1:
        maxid = max(versions.keys())

        for vid, version in versions.items():
            if vid != maxid:
                rid, tid = version.id, version.transaction_id
                print("Deleting:", version, rid, tid)
                g.session.query(ctbl)                     \
                 .filter(ctbl.c.id             == rid) \
                 .filter(ctbl.c.transaction_id == tid) \
                 .delete(synchronize_session=False)
        g.session.commit()
        return render_template('error.html',
                               title='All old versions deleted',
                               message="All old versions deleted")

    else:
        if version not in versions:
            return render_template(
                'error.html',
                title='Error when deleting!',
                message="Version value doesn't exist? ('%s', '%s')" %
                (version, type(version)))
        target = versions[version]
        if not target.id == delete_id:
            return render_template(
                'error.html',
                title='Error when deleting!',
                message="Delete row PK Id doesn't match specified delete ID?")

        print("Deleting:", target)
        g.session.query(ctbl)                                       \
         .filter(ctbl.c.id == target.id)                         \
         .filter(ctbl.c.transaction_id == target.transaction_id) \
         .delete(synchronize_session=False)
        g.session.commit()

        return render_template('error.html',
                               title='Row deleted',
                               message="Row: '%s', '%s'" %
                               (delete_id, version))
Example #14
def view_rendered():
    req_url = request.args.get('url')
    if not req_url:
        return render_template('error.html',
                               title='Error',
                               message="Error! No page specified!")

    req_url = request.args.get('url')
    version = request.args.get('version')
    ignore_cache = request.args.get("nocache")

    filterstate = get_filter_state_for_url(req_url)

    try:
        if version == "None" or version is None:
            version = None
        else:
            version = ast.literal_eval(version)
    except ValueError:
        traceback.print_exc()
        return render_template(
            'error.html',
            title='Error',
            message="Error! Historical version number must be an integer!")

    if version and ignore_cache:
        return render_template(
            'error.html',
            title='Error',
            message="Error! Cannot render a historical version with nocache!")

    if version:
        rid, tid = version

        print("Historical row id: ", rid, tid)

        ctbl = version_table(db.WebPages.__table__)

        rows = g.session.query(ctbl.c.title, ctbl.c.content) \
         .filter(ctbl.c.id == rid)                        \
         .filter(ctbl.c.transaction_id == tid)            \
         .all()

        if rows:
            row = rows.pop()
            title, content = row
            content = utilities.replace_links(content)
            cachestate = "Historical version: %s" % (version, )
    else:
        title, content, cachestate = WebMirror.API.getPage(
            req_url, ignore_cache=ignore_cache, version=version)

    response = make_response(
        render_template(
            'view-rendered.html',
            req_url=req_url,
            version=version,
            title=title,
            contents=content,
            cachestate=cachestate,
            filterstate=filterstate,
        ))
    return set_cache_control_headers(response, allow_inline=True)