Example #1
def db_fiddle():
	print("Fixing DB things.")
	print("Getting IDs")
	have = db.get_db_session().execute("""
		SELECT id FROM web_pages WHERE url LIKE 'https://www.wattpad.com/story%' AND state != 'new';
		""")
	print("Query executed. Fetching results")
	have = list(have)
	print(len(have))
	count = 0

	chunk = []
	for item, in have:  # each row is a one-column tuple; the trailing comma unpacks it
		chunk.append(item)

		count += 1
		if count % 1000 == 0:
			statement = db.get_db_session().query(db.WebPages) \
				.filter(db.WebPages.state != 'new')        \
				.filter(db.WebPages.id.in_(chunk))

			statement.update({db.WebPages.state : 'new'}, synchronize_session=False)
			chunk = []
			print(count, item)
			db.get_db_session().commit()

	# Flush the final partial chunk, which the modulo check above misses.
	if chunk:
		db.get_db_session().query(db.WebPages) \
			.filter(db.WebPages.state != 'new') \
			.filter(db.WebPages.id.in_(chunk))  \
			.update({db.WebPages.state : 'new'}, synchronize_session=False)
		db.get_db_session().commit()
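The chunk-and-commit pattern above recurs throughout these examples. A minimal sketch of it as a reusable helper, assuming only the db module used above; chunked() and mark_new() are hypothetical names, not part of the original codebase:

def chunked(iterable, size):
	# Yield successive fixed-size lists from any iterable.
	chunk = []
	for item in iterable:
		chunk.append(item)
		if len(chunk) >= size:
			yield chunk
			chunk = []
	if chunk:
		yield chunk

def mark_new(session, ids, size=1000):
	# Flip each batch of rows back to 'new', committing per batch
	# to bound transaction size.
	for batch in chunked(ids, size):
		session.query(db.WebPages) \
			.filter(db.WebPages.state != 'new') \
			.filter(db.WebPages.id.in_(batch))  \
			.update({db.WebPages.state: 'new'}, synchronize_session=False)
		session.commit()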
Example #2
def clear_bad():
	rules = WebMirror.rules.load_rules()

	for ruleset in rules:
		# Rulesets without netlocs can't be scoped to a site; skip them outright.
		if not ruleset['netlocs']:
			continue

		print("Cleaning ruleset")
		# print(ruleset['netlocs'])
		# print(ruleset.keys())
		for badword in ruleset['badwords']:
			if "%" in badword:
				# Badwords containing SQL wildcards are only reported, never deleted.
				print(badword)
			else:
				print("Deleting items containing string: '%s'" % badword)
				q = db.get_db_session().query(db.WebPages)                   \
					.filter(db.WebPages.netloc.in_(ruleset['netlocs']))   \
					.filter(db.WebPages.url.like("%{}%".format(badword)))
				items = q.count()
				if items:
					print("%s results for : '%s'" % (items, badword))

					q = db.get_db_session().query(db.WebPages)                   \
						.filter(db.WebPages.netloc.in_(ruleset['netlocs']))   \
						.filter(db.WebPages.url.like("%{}%".format(badword))) \
						.delete(synchronize_session=False)
					db.get_db_session().commit()
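The count and the delete in clear_bad() rebuild the same query twice; SQLAlchemy query objects are reusable, so the criteria only need to be applied once. A sketch under the same assumptions about db.WebPages:

	q = db.get_db_session().query(db.WebPages) \
		.filter(db.WebPages.netloc.in_(ruleset['netlocs'])) \
		.filter(db.WebPages.url.like("%{}%".format(badword)))
	if q.count():
		# The same query object drives the bulk delete.
		q.delete(synchronize_session=False)
		db.get_db_session().commit()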
Example #3
def update_feed_names():
	for key, value in feedNameLut.mapper.items():
		feed_items = db.get_db_session().query(db.FeedItems) \
				.filter(db.FeedItems.srcname == key)    \
				.all()
		if feed_items:
			for item in feed_items:
				item.srcname = value
			print(len(feed_items))
			print(key, value)
			db.get_db_session().commit()
Example #4
def initializeStartUrls(rules):
    print("Initializing all start URLs in the database")
    sess = db.get_db_session()
    for ruleset in [rset for rset in rules if rset['starturls']]:
        for starturl in ruleset['starturls']:
            have = sess.query(db.WebPages) \
             .filter(db.WebPages.url == starturl)   \
             .count()
            if not have:
                netloc = urlFuncs.getNetLoc(starturl)
                new = db.WebPages(
                    url=starturl,
                    starturl=starturl,
                    netloc=netloc,
                    type=ruleset['type'],
                    priority=db.DB_IDLE_PRIORITY,
                    distance=db.DB_DEFAULT_DIST,
                    normal_fetch_mode=ruleset['normal_fetch_mode'],
                )
                print("Missing start-url for address: '{}'".format(starturl))
                sess.add(new)
            try:
                sess.commit()
            except sqlalchemy.exc.SQLAlchemyError:
                print("Failure inserting start url for address: '{}'".format(
                    starturl))

                sess.rollback()
    sess.close()
    db.delete_db_session()
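The query-then-insert in initializeStartUrls() can race if several workers initialize at once. If web_pages has a unique constraint on url (an assumption; the schema isn't shown here), PostgreSQL's ON CONFLICT clause sidesteps that. A hedged sketch using SQLAlchemy's postgresql dialect:

    from sqlalchemy.dialects.postgresql import insert

    # Hypothetical: relies on a UNIQUE constraint or index on web_pages.url.
    stmt = insert(db.WebPages.__table__).values(
        url=starturl,
        starturl=starturl,
        netloc=netloc,
        type=ruleset['type'],
        priority=db.DB_IDLE_PRIORITY,
        distance=db.DB_DEFAULT_DIST,
        normal_fetch_mode=ruleset['normal_fetch_mode'],
    ).on_conflict_do_nothing(index_elements=['url'])
    sess.execute(stmt)
    sess.commit()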
Example #5
def longest_rows():
	print("Getting longest rows from database")
	have = db.get_db_session().execute("""
		SELECT
			id, url, length(content), content
		FROM
			web_pages
		ORDER BY
			LENGTH(content) DESC NULLS LAST
		LIMIT 50;
		""")
	print("Rows:")

	import os
	import os.path

	savepath = "./large_files/"
	os.makedirs(savepath, exist_ok=True)
	for row in have:
		print(row[0], row[1])
		with open(os.path.join(savepath, "file %s.txt" % row[0]), "wb") as fp:
			urlst = "URL: %s\n\n" % row[1]
			size = "Length: %s\n\n" % row[2]
			fp.write(urlst.encode("utf-8"))
			fp.write(size.encode("utf-8"))
			fp.write("{}".format(row[3]).encode("utf-8"))
Example #6
def delete_comment_feed_items():

	sess = db.get_db_session()
	bad = sess.query(db.FeedItems) \
			.filter(or_(
				db.FeedItems.contenturl.like("%#comment-%"),
				db.FeedItems.contenturl.like("%CommentsForInMyDaydreams%"),
				db.FeedItems.contenturl.like("%www.fanfiction.net%"),
				db.FeedItems.contenturl.like("%www.fictionpress.com%"),
				db.FeedItems.contenturl.like("%www.booksie.com%")))    \
			.order_by(db.FeedItems.contenturl) \
			.all()

	count = 0
	for item in bad:
		print(item.contenturl)

		# Empty the association lists so the link rows are removed with the parent.
		while item.author:
			item.author.pop()
		while item.tags:
			item.tags.pop()
		sess.delete(item)
		count += 1
		if count % 1000 == 0:
			print("Committing at %s" % count)
			sess.commit()

	print("Done. Committing...")
	sess.commit()
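A note on the loop above: a single bulk query delete (as used in several other examples here) would be shorter, but query-level delete() bypasses ORM-level cascade handling, which is presumably why this function deletes row by row after emptying the author and tags lists. The bulk form, for comparison:

	sess.query(db.FeedItems) \
		.filter(or_(
			db.FeedItems.contenturl.like("%#comment-%"),
			db.FeedItems.contenturl.like("%www.fanfiction.net%"))) \
		.delete(synchronize_session=False)
	sess.commit()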
Example #7
    def doCall(self):

        self.log.info("Calling job %s", self.job_name)
        session = db.get_db_session()
        item = session.query(db.PluginStatus).filter(
            db.PluginStatus.plugin_name == self.job_name).one()
        if item.is_running:
            session.commit()
            self.log.error(
                "Plugin %s is already running! Not doing re-entrant call!",
                self.job_name)
            return

        item.is_running = True
        item.last_run = datetime.datetime.now()
        session.commit()

        try:
            self._doCall()
        except Exception:
            item.last_error = datetime.datetime.now()
            item.last_error_msg = traceback.format_exc()
            raise
        finally:

            item2 = session.query(db.PluginStatus).filter(
                db.PluginStatus.plugin_name == self.job_name).one()
            item2.is_running = False
            item2.last_run_end = datetime.datetime.now()
            session.commit()
            db.delete_db_session()
        self.log.info("Job %s complete.", self.job_name)
Example #8
    def __init__(self, job_name):

        if job_name not in CALLABLE_LUT:
            raise JobNameException(
                "Callable '%s' is not in the class lookup table: '%s'!" %
                (job_name, CALLABLE_LUT))
        self.runModule = CALLABLE_LUT[job_name]
        self.job_name = job_name

        session = db.get_db_session()

        try:
            query = session.query(db.PluginStatus).filter(
                db.PluginStatus.plugin_name == job_name)
            have = query.scalar()
            if not have:
                new = db.PluginStatus(plugin_name=job_name)
                session.add(new)
                session.commit()
        except sqlalchemy.exc.OperationalError:
            session.rollback()
        except sqlalchemy.exc.InvalidRequestError:
            session.rollback()

        finally:
            db.delete_db_session()
Example #9
def resetRunStates():
    print("JobSetup call resetting run-states!")
    session = db.get_db_session()
    session.query(db.PluginStatus).update({db.PluginStatus.is_running: False})
    session.commit()
    db.delete_db_session()
    print("Run-states reset.")
Example #10
def test_retrieve(url, debug=True, rss_debug=False):

	# try:
	# 	WebMirror.SpecialCase.startAmqpFetcher()
	# except RuntimeError:  # Fetcher already started
	# 	pass

	if rss_debug:
		print("Debugging RSS")
		flags.RSS_DEBUG = True

	parsed = urllib.parse.urlparse(url)
	root = urllib.parse.urlunparse((parsed[0], parsed[1], "", "", "", ""))

	new = db.WebPages(
		url       = url,
		starturl  = root,
		netloc    = parsed.netloc,
		distance  = 50000,
		is_text   = True,
		priority  = 500000,
		type      = 'unknown',
		fetchtime = datetime.datetime.now(),
		)

	if debug:
		print(new)

	try:
		archiver = SiteArchiver(None, db.get_db_session(), None)
		job     = archiver.synchronousJobRequest(url, ignore_cache=True)
	except Exception:
		traceback.print_exc()
	finally:
		db.delete_db_session()
Example #11
	def __init__(self, connect=True):

		input_settings = {
			'RABBIT_LOGIN'      : settings.NU_RABBIT_LOGIN,
			'RABBIT_PASWD'      : settings.NU_RABBIT_PASWD,
			'RABBIT_SRVER'      : settings.NU_RABBIT_SRVER,
			'RABBIT_VHOST'      : settings.NU_RABBIT_VHOST,
			'synchronous'       : False,
			'prefetch'          : 1,
			'master'            : True,
			'taskq_task'        : 'nuresponse.master.q',
			'taskq_response'    : 'nureleases.master.q',
			'poll_rate'         : 1.0 / 25,
		}
		if connect:
			self.data_in = WebMirror.OutputFilters.AmqpInterface.RabbitQueueHandler(input_settings)

		# output_settings = {
		# 	'RABBIT_LOGIN'      : settings.RABBIT_LOGIN,
		# 	'RABBIT_PASWD'      : settings.RABBIT_PASWD,
		# 	'RABBIT_SRVER'      : settings.RABBIT_SRVER,
		# 	'RABBIT_VHOST'      : settings.RABBIT_VHOST,
		# 	'taskq_task'     : 'task.master.q',
		# 	'taskq_response' : 'response.master.q',
		# }

		self.name_lut, self.group_lut = load_lut()

		super().__init__(db_sess = db.get_db_session(postfix='nu_forwarder'), connect=connect)
Example #12
def fix_null():
	step = 50000


	end = db.get_db_session().execute("""SELECT MAX(id) FROM web_pages WHERE  ignoreuntiltime IS NULL;""")
	end = list(end)[0][0]

	start = db.get_db_session().execute("""SELECT MIN(id) FROM web_pages WHERE ignoreuntiltime IS NULL;""")
	start = list(start)[0][0]

	changed = 0

	if not start:
		print("No null rows to fix!")
		return

	start = start - (start % step)

	for x in range(start, end + step, step):  # + step so the final partial window is covered
		# SQL String munging! I'm a bad person!
		# Only done because I can't easily find how to make sqlalchemy
		# bind parameters ignore the postgres specific cast
		# The id range forces the query planner to use a much smarter approach which is much more performant for small numbers of updates
		have = db.get_db_session().execute("""UPDATE web_pages SET ignoreuntiltime = 'epoch'::timestamp WHERE ignoreuntiltime IS NULL AND id < %s AND id >= %s;""" % (x, x-step))
		# print()
		print('%10i, %7.4f, %6i' % (x, x/end * 100, have.rowcount))
		changed += have.rowcount
		if changed > 10000:
			print("Committing (%s changed rows)...." % changed, end=' ')
			db.get_db_session().commit()
			print("done")
			changed = 0
	db.get_db_session().commit()
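On the bind-parameter comment in fix_null(): the ::timestamp cast applies to the 'epoch' literal rather than to a parameter, so the id bounds can be bound normally. A parameterized sketch of the same statement using sqlalchemy.text():

	from sqlalchemy import text

	stmt = text("""UPDATE web_pages SET ignoreuntiltime = 'epoch'::timestamp
		WHERE ignoreuntiltime IS NULL AND id < :hi AND id >= :lo""")
	have = db.get_db_session().execute(stmt, {'hi': x, 'lo': x - step})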
Example #13
def clear_blocked():
	for ruleset in WebMirror.rules.load_rules():
		if ruleset['netlocs'] and ruleset['badwords']:
			# mask = [db.WebPages.url.like("%{}%".format(tmp)) for tmp in ruleset['badwords'] if not "%" in tmp]

			for badword in ruleset['badwords']:
				feed_items = db.get_db_session().query(db.WebPages)          \
					.filter(db.WebPages.netloc.in_(ruleset['netlocs']))   \
					.filter(db.WebPages.url.like("%{}%".format(badword))) \
					.count()

				if feed_items:
					print("Have:  ", feed_items, badword)
					feed_items = db.get_db_session().query(db.WebPages)          \
						.filter(db.WebPages.netloc.in_(ruleset['netlocs']))   \
						.filter(db.WebPages.url.like("%{}%".format(badword))) \
						.delete(synchronize_session=False)
					# Bulk deletes run inside a transaction; without a commit they are lost.
					db.get_db_session().commit()
					db.get_db_session().expire_all()

				else:
					print("Empty: ", feed_items, badword)
Example #14
def fix_tsv():
	step = 1000


	print("Determining extents that need to be changed.")
	start = db.get_db_session().execute("""SELECT MIN(id) FROM web_pages WHERE tsv_content IS NULL AND content IS NOT NULL AND id != 60903982;""")
	start = list(start)[0][0]

	end = db.get_db_session().execute("""SELECT MAX(id) FROM web_pages WHERE tsv_content IS NULL AND content IS NOT NULL;""")
	end = list(end)[0][0]

	changed = 0
	print("Start: ", start)
	print("End: ", end)


	if not start:
		print("No null rows to fix!")
		return

	start = start - (start % step)

	for x in range(start, end + step, step):  # + step so the final partial window is covered
		try:
			# SQL String munging! I'm a bad person!
			# Only done because I can't easily find how to make sqlalchemy
			# bind parameters ignore the postgres specific cast
			# The id range forces the query planner to use a much smarter approach which is much more performant for small numbers of updates
			have = db.get_db_session().execute("""UPDATE web_pages SET tsv_content = to_tsvector(coalesce(content)) WHERE tsv_content IS NULL AND content IS NOT NULL AND id < %s AND id >= %s;""" % (x, x-step))
			# print()
			print('%10i, %10i, %7.4f, %6i' % (x, end, (x-start)/(end-start) * 100, have.rowcount))
			changed += have.rowcount
			if changed > step:
				print("Committing (%s changed rows)...." % changed, end=' ')
				db.get_db_session().commit()
				print("done")
				changed = 0
		except sqlalchemy.exc.OperationalError:
			db.get_db_session().rollback()
			print("Error!")
			traceback.print_exc()

	db.get_db_session().commit()
Example #15
def resetInProgress():
    print("Resetting any stalled downloads from the previous session.")

    sess = db.get_db_session()
    sess.query(db.WebPages) \
     .filter(
       (db.WebPages.state == "fetching")           |
       (db.WebPages.state == "processing")         |
       (db.WebPages.state == "specialty_deferred") |
       (db.WebPages.state == "specialty_ready")
       )   \
     .update({db.WebPages.state : "new"})
    sess.commit()
    sess.close()
    db.delete_db_session()
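The chained equality comparisons in resetInProgress() can equivalently be written with in_(); a behavior-equivalent sketch (synchronize_session=False added because the 'evaluate' default cannot always handle in_ criteria):

    sess.query(db.WebPages) \
        .filter(db.WebPages.state.in_(
            ["fetching", "processing", "specialty_deferred", "specialty_ready"])) \
        .update({db.WebPages.state: "new"}, synchronize_session=False)
    sess.commit()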
Example #16
    def get_times(self):
        conn = database.get_db_session()
        aps = conn.execute("SELECT job_state FROM apscheduler_jobs;")

        update_times = []
        for blob, in aps:
            job_dict = pickle.loads(blob)
            update_times.append(
                (job_dict['id'], job_dict['next_run_time'].isoformat()))

        data = {
            "update-times": update_times,
        }
        database.delete_db_session()

        return pack_message("system-update-times", data)
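One caveat for get_times(): APScheduler stores next_run_time as None for paused jobs, so the isoformat() call can raise AttributeError. A guarded variant of the loop:

        for blob, in aps:
            job_dict = pickle.loads(blob)
            next_run = job_dict['next_run_time']
            # Paused jobs have no next run time; report None instead of crashing.
            update_times.append(
                (job_dict['id'], next_run.isoformat() if next_run else None))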
Example #17
	def clean_files(self):

		session = db.get_db_session()
		q = session.query(db.WebFiles) \
			.filter(db.WebFiles.fspath != None)

		self.log.info("Querying for non-null filepaths...")
		have = q.all()
		self.log.info("Have %s local files.", len(have))
		count = 0
		for file in have:
			fpath = os.path.join(settings.RESOURCE_DIR, file.fspath)
			if not os.path.exists(fpath):
				self.log.error("Missing file: %s", fpath)

			count += 1
			if count % 1000 == 0:
				self.log.info("Scanned %s files.", count)
Example #18
def consolidate_history():

	sess = db.get_db_session()
	print("Doing select")
	end = sess.execute("""
			SELECT
				count(*), url
			FROM
				web_pages_version
			GROUP BY
				url
			HAVING
				COUNT(*) > 1
			LIMIT
				40
		""")
	print("Wut?")
	print(end)
Example #19
    def do_task(self):

        db_handle = db.get_db_session()

        hadjob = False
        try:
            self.archiver = WebMirror.Engine.SiteArchiver(
                self.cookie_lock,
                new_job_queue=self.new_job_queue,
                response_queue=self.resp_queue,
                db_interface=db_handle)
            hadjob = self.archiver.taskProcess()
        finally:
            # Clear out the sqlalchemy state
            db_handle.expunge_all()
            db.delete_db_session()

        return hadjob
Example #20
def delete_feed(feed_name, do_delete, search_str):

	sess = db.get_db_session()
	items = sess.query(db.FeedItems)               \
		.filter(db.FeedItems.srcname == feed_name) \
		.all()

	do_delete = "true" in do_delete.lower()

	searchitems = search_str.split("|")
	for item in items:
		itemall = " ".join([item.title] + item.tags)
		if all([searchstr in itemall for searchstr in searchitems]):
			print(itemall)
			if do_delete:
				print("Deleting item")
				sess.delete(item)

	sess.commit()
Example #21
def load_items():
    feed_items = db.get_db_session().query(db.FeedItems) \
      .order_by(db.FeedItems.srcname)           \
      .order_by(db.FeedItems.title)           \
      .all()

    with open("tests/title_test_data_two.py", 'w') as fp:
        fp.write("data = [\n")
        for row in feed_items:
            title = row.title

            try:
                p = TP(title)
                fp.write(
                    format_row(title, p.getVolume(), p.getChapter(),
                               p.getFragment(), p.getPostfix()))
            except ValueError:
                fp.write(format_row(title, 0, 0, 0, ''))

        fp.write("]\n")
Example #22
def rss_db_sync(target = None, days=False, silent=False):

	json_file = 'rss_filter_misses-1.json'

	write_debug = True
	if silent:
		config.C_DO_RABBIT = False
	if target:
		config.C_DO_RABBIT = False
		flags.RSS_DEBUG    = True
		write_debug = False
	else:
		try:
			os.unlink(json_file)
		except FileNotFoundError:
			pass

	import WebMirror.processor.RssProcessor
	parser = WebMirror.processor.RssProcessor.RssProcessor(loggerPath   = "Main.RssDb",
															pageUrl     = 'http://www.example.org',
															pgContent   = '',
															type        = 'application/atom+xml',
															transfer    = False,
															debug_print = True,
															db_sess = None,
															write_debug = write_debug)


	print("Getting feed items....")

	if target:
		print("Limiting to '%s' source." % target)
		feed_items = db.get_db_session().query(db.FeedItems) \
				.filter(db.FeedItems.srcname == target)    \
				.order_by(db.FeedItems.srcname)           \
				.order_by(db.FeedItems.title)           \
				.all()
	elif days:
		print("RSS age override: ", days)
		cutoff = datetime.datetime.now() - datetime.timedelta(days=days)
		feed_items = db.get_db_session().query(db.FeedItems) \
				.filter(db.FeedItems.published > cutoff)  \
				.order_by(db.FeedItems.srcname)           \
				.order_by(db.FeedItems.title)             \
				.all()
	else:
		feed_items = db.get_db_session().query(db.FeedItems) \
				.order_by(db.FeedItems.srcname)           \
				.order_by(db.FeedItems.title)           \
				.all()


	print("Feed items: ", len(feed_items))

	for item in feed_items:
		ctnt = {}
		ctnt['srcname']   = item.srcname
		ctnt['title']     = item.title
		ctnt['tags']      = item.tags
		ctnt['linkUrl']   = item.contenturl
		ctnt['guid']      = item.contentid
		ctnt['published'] = calendar.timegm(item.published.timetuple())

		# Pop()ed off in processFeedData().
		ctnt['contents']  = 'wat'

		try:
			parser.processFeedData(ctnt, tx_raw=False, tx_parse=not bool(days))
		except ValueError:
			pass
		# print(ctnt)
	if target is None:
		sort_json(json_file)
Example #23
    def launch_agg(cls, agg_queue):
        install_pystuck()
        agg_db = db.get_db_session()
        instance = cls(agg_queue, agg_db)
        instance.run()
        instance.close()
Example #24
def purge_invalid_urls(selected_netloc=None):


	sess = db.get_db_session()
	for ruleset in WebMirror.rules.load_rules():

		if (
					(ruleset['netlocs'] and ruleset['badwords'])
				and
					(selected_netloc is None or selected_netloc in ruleset['netlocs'])
			):
			# We have to delete from the normal table before the versioning table,
			# because deleting from the normal table causes inserts into the versioning table
			# due to the change tracking triggers.

			count = 1  # The count queries below are commented out; 1 forces the delete branch.

			loc = and_(
					db.WebPages.netloc.in_(ruleset['netlocs']),
					or_(*(db.WebPages.url.like("%{}%".format(badword)) for badword in ruleset['badwords']))
				)
			# print("Doing count on table ")
			# count = sess.query(db.WebPages) \
			# 	.filter(or_(*opts)) \
			# 	.count()


			if count == 0:
				print("{num} items match badwords from file {file}. No deletion required ".format(file=ruleset['filename'], num=count))
			else:
				print("{num} items match badwords from file {file}. Deleting ".format(file=ruleset['filename'], num=count))

				# loc is already a complete and_() clause; or_(*loc) would flatten
				# it into a broad OR and delete far more than intended.
				sess.query(db.WebPages) \
					.filter(loc) \
					.delete(synchronize_session=False)


			# Do the delete from the versioning table now.
			ctbl = version_table(db.WebPages)
			loc2 = and_(
					ctbl.c.netloc.in_(ruleset['netlocs']),
					or_(*(ctbl.c.url.like("%{}%".format(badword)) for badword in ruleset['badwords']))
				)
			# print("Doing count on Versioning table ")
			# count = sess.query(ctbl) \
			# 	.filter(or_(*opts)) \
			# 	.count()

			if count == 0:
				print("{num} items in versioning table match badwords from file {file}. No deletion required ".format(file=ruleset['filename'], num=count))
			else:
				print("{num} items in versioning table match badwords from file {file}. Deleting ".format(file=ruleset['filename'], num=count))

				sess.query(ctbl) \
					.filter(loc2) \
					.delete(synchronize_session=False)



			sess.commit()
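If the cost is acceptable, the commented-out count queries can be restored using the criteria objects already built for the deletes, which also makes the printed counts truthful. A sketch:

			count = sess.query(db.WebPages).filter(loc).count()
			# ...and, for the versioning table:
			count = sess.query(ctbl).filter(loc2).count()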
Example #25
def disable_wattpad():
	step = 50000


	print("Determining extents that need to be changed.")
	start = db.get_db_session().execute("""
		SELECT
			MIN(id)
		FROM
			web_pages
		WHERE
			(netloc = 'www.wattpad.com' OR netloc = 'a.wattpad.com')
		;""")

	start = list(start)[0][0]
	print("Start: ", start)

	end = db.get_db_session().execute("""
		SELECT
			MAX(id)
		FROM
			web_pages
		WHERE
			(netloc = 'www.wattpad.com' OR netloc = 'a.wattpad.com')
		;""")
	end = list(end)[0][0]

	changed = 0
	print("End: ", end)


	if not start:
		print("No null rows to fix!")
		return

	start = start - (start % step)

	for x in range(start, end + step, step):  # + step so the final partial window is covered
		try:
			# SQL String munging! I'm a bad person!
			# Only done because I can't easily find how to make sqlalchemy
			# bind parameters ignore the postgres specific cast
			# The id range forces the query planner to use a much smarter approach which is much more performant for small numbers of updates
			have = db.get_db_session().execute("""
				UPDATE
					web_pages
				SET
					state = 'disabled'
				WHERE
						(netloc = 'www.wattpad.com' OR netloc = 'a.wattpad.com')
					AND
						(state = 'new' OR state = 'fetching' OR state = 'processing')
					AND
						id < %s
					AND
						id >= %s;""" % (x, x-step))
			# print()
			print('%10i, %10i, %10i, %7.4f, %6i' % (start, x, end, (x-start)/(end-start) * 100, have.rowcount))
			changed += have.rowcount
			if changed > step / 2:
				print("Committing (%s changed rows)...." % changed, end=' ')
				db.get_db_session().commit()
				print("done")
				changed = 0
		except sqlalchemy.exc.OperationalError:
			db.get_db_session().rollback()
			print("Error!")
			traceback.print_exc()

	db.get_db_session().commit()
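Examples #12, #14, and #25 all repeat the same id-windowed UPDATE loop. A minimal sketch of that loop factored into a helper; batched_update and make_stmt are hypothetical names, and the statement renderer is whatever raw SQL or text() construct the caller needs:

	def batched_update(session, make_stmt, start, end, step=50000, commit_every=10000):
		# Walk ids in [start, end] window by window, committing once enough
		# rows have changed. Ranging to end + step ensures the final partial
		# window is executed rather than silently skipped.
		start = start - (start % step)
		changed = 0
		for hi in range(start, end + step, step):
			result = session.execute(make_stmt(hi - step, hi))
			changed += result.rowcount
			if changed >= commit_every:
				session.commit()
				changed = 0
		session.commit()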